diff --git a/go.mod b/go.mod index 1c30ec092..4ccdfecdf 100644 --- a/go.mod +++ b/go.mod @@ -84,6 +84,7 @@ require ( github.com/ipfs/go-log/v2 v2.5.1 github.com/jellydator/ttlcache/v3 v3.2.0 github.com/jmoiron/sqlx v1.3.5 + github.com/klauspost/reedsolomon v1.12.1 github.com/ladydascalie/currency v1.6.0 github.com/meirf/gopart v0.0.0-20180520194036-37e9492a85a8 github.com/mutecomm/go-sqlcipher/v4 v4.4.2 @@ -177,7 +178,7 @@ require ( github.com/jackpal/go-nat-pmp v1.0.2 // indirect github.com/jbenet/go-temp-err-catcher v0.1.0 // indirect github.com/klauspost/compress v1.16.7 // indirect - github.com/klauspost/cpuid/v2 v2.2.5 // indirect + github.com/klauspost/cpuid/v2 v2.2.7 // indirect github.com/koron/go-ssdp v0.0.4 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect @@ -279,7 +280,7 @@ require ( go.uber.org/fx v1.20.0 // indirect golang.org/x/mod v0.12.0 // indirect golang.org/x/sync v0.3.0 // indirect - golang.org/x/sys v0.11.0 // indirect + golang.org/x/sys v0.18.0 // indirect golang.org/x/term v0.11.0 // indirect golang.org/x/tools v0.12.1-0.20230818130535-1517d1a3ba60 // indirect golang.org/x/xerrors v0.0.0-20220609144429-65e65417b02f // indirect diff --git a/go.sum b/go.sum index 466cb1c7a..dde981db2 100644 --- a/go.sum +++ b/go.sum @@ -1295,10 +1295,12 @@ github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQs github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg= -github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= +github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM= +github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= github.com/klauspost/crc32 v0.0.0-20161016154125-cb6bfca970f6/go.mod h1:+ZoRqAPRLkC4NPOvfYeR5KNOrY6TD+/sAC3HXPZgDYg= github.com/klauspost/pgzip v1.0.2-0.20170402124221-0bf5dcad4ada/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs= +github.com/klauspost/reedsolomon v1.12.1 h1:NhWgum1efX1x58daOBGCFWcxtEhOhXKKl1HAPQUp03Q= +github.com/klauspost/reedsolomon v1.12.1/go.mod h1:nEi5Kjb6QqtbofI6s+cbG/j1da11c96IBYBSnVGtuBs= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -2637,8 +2639,8 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= -golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term 
v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210220032956-6a3ed077a48d/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md index accd7abaf..30f8d2963 100644 --- a/vendor/github.com/klauspost/cpuid/v2/README.md +++ b/vendor/github.com/klauspost/cpuid/v2/README.md @@ -9,10 +9,7 @@ You can access the CPU information by accessing the shared CPU variable of the c Package home: https://github.com/klauspost/cpuid [![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2) -[![Build Status][3]][4] - -[3]: https://travis-ci.org/klauspost/cpuid.svg?branch=master -[4]: https://travis-ci.org/klauspost/cpuid +[![Go](https://github.com/klauspost/cpuid/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/cpuid/actions/workflows/go.yml) ## installing @@ -285,7 +282,12 @@ Exit Code 1 | AMXINT8 | Tile computational operations on 8-bit integers | | AMXFP16 | Tile computational operations on FP16 numbers | | AMXTILE | Tile architecture | +| APX_F | Intel APX | | AVX | AVX functions | +| AVX10 | If set the Intel AVX10 Converged Vector ISA is supported | +| AVX10_128 | If set indicates that AVX10 128-bit vector support is present | +| AVX10_256 | If set indicates that AVX10 256-bit vector support is present | +| AVX10_512 | If set indicates that AVX10 512-bit vector support is present | | AVX2 | AVX2 functions | | AVX512BF16 | AVX-512 BFLOAT16 Instructions | | AVX512BITALG | AVX-512 Bit Algorithms | @@ -365,6 +367,8 @@ Exit Code 1 | IDPRED_CTRL | IPRED_DIS | | INT_WBINVD | WBINVD/WBNOINVD are interruptible. | | INVLPGB | NVLPGB and TLBSYNC instruction supported | +| KEYLOCKER | Key locker | +| KEYLOCKERW | Key locker wide | | LAHF | LAHF/SAHF in long mode | | LAM | If set, CPU supports Linear Address Masking | | LBRVIRT | LBR virtualization | @@ -380,7 +384,7 @@ Exit Code 1 | MOVDIRI | Move Doubleword as Direct Store | | MOVSB_ZL | Fast Zero-Length MOVSB | | MPX | Intel MPX (Memory Protection Extensions) | -| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD | +| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. 
MOVUPD is more efficient than MOVLPD/MOVHPD | | MSRIRC | Instruction Retired Counter MSR available | | MSRLIST | Read/Write List of Model Specific Registers | | MSR_PAGEFLUSH | Page Flush MSR available | diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go index d015c744e..805f5e7b4 100644 --- a/vendor/github.com/klauspost/cpuid/v2/cpuid.go +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go @@ -67,188 +67,200 @@ const ( // Keep index -1 as unknown UNKNOWN = -1 - // Add features - ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) - AESNI // Advanced Encryption Standard New Instructions - AMD3DNOW // AMD 3DNOW - AMD3DNOWEXT // AMD 3DNowExt - AMXBF16 // Tile computational operations on BFLOAT16 numbers - AMXFP16 // Tile computational operations on FP16 numbers - AMXINT8 // Tile computational operations on 8-bit integers - AMXTILE // Tile architecture - AVX // AVX functions - AVX2 // AVX2 functions - AVX512BF16 // AVX-512 BFLOAT16 Instructions - AVX512BITALG // AVX-512 Bit Algorithms - AVX512BW // AVX-512 Byte and Word Instructions - AVX512CD // AVX-512 Conflict Detection Instructions - AVX512DQ // AVX-512 Doubleword and Quadword Instructions - AVX512ER // AVX-512 Exponential and Reciprocal Instructions - AVX512F // AVX-512 Foundation - AVX512FP16 // AVX-512 FP16 Instructions - AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions - AVX512PF // AVX-512 Prefetch Instructions - AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions - AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 - AVX512VL // AVX-512 Vector Length Extensions - AVX512VNNI // AVX-512 Vector Neural Network Instructions - AVX512VP2INTERSECT // AVX-512 Intersect for D/Q - AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword - AVXIFMA // AVX-IFMA instructions - AVXNECONVERT // AVX-NE-CONVERT instructions - AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one - AVXVNNI // AVX (VEX encoded) VNNI neural network instructions - AVXVNNIINT8 // AVX-VNNI-INT8 instructions - BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 - BMI1 // Bit Manipulation Instruction Set 1 - BMI2 // Bit Manipulation Instruction Set 2 - CETIBT // Intel CET Indirect Branch Tracking - CETSS // Intel CET Shadow Stack - CLDEMOTE // Cache Line Demote - CLMUL // Carry-less Multiplication - CLZERO // CLZERO instruction supported - CMOV // i686 CMOV - CMPCCXADD // CMPCCXADD instructions - CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB - CMPXCHG8 // CMPXCHG8 instruction - CPBOOST // Core Performance Boost - CPPC // AMD: Collaborative Processor Performance Control - CX16 // CMPXCHG16B Instruction - EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ - ENQCMD // Enqueue Command - ERMS // Enhanced REP MOVSB/STOSB - F16C // Half-precision floating-point conversion - FLUSH_L1D // Flush L1D cache - FMA3 // Intel FMA 3. Does not imply AVX. - FMA4 // Bulldozer FMA4 functions - FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide - FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide - FSRM // Fast Short Rep Mov - FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9 - FXSROPT // FXSAVE/FXRSTOR optimizations - GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. 
- HLE // Hardware Lock Elision - HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR - HTT // Hyperthreading (enabled) - HWA // Hardware assert supported. Indicates support for MSRC001_10 - HYBRID_CPU // This part has CPUs of more than one type. - HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors - IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel) - IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR - IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) - IBRS // AMD: Indirect Branch Restricted Speculation - IBRS_PREFERRED // AMD: IBRS is preferred over software solution - IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection - IBS // Instruction Based Sampling (AMD) - IBSBRNTRGT // Instruction Based Sampling Feature (AMD) - IBSFETCHSAM // Instruction Based Sampling Feature (AMD) - IBSFFV // Instruction Based Sampling Feature (AMD) - IBSOPCNT // Instruction Based Sampling Feature (AMD) - IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) - IBSOPSAM // Instruction Based Sampling Feature (AMD) - IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) - IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) - IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported - IBS_OPDATA4 // AMD: IBS op data 4 MSR supported - IBS_OPFUSE // AMD: Indicates support for IbsOpFuse - IBS_PREVENTHOST // Disallowing IBS use by the host supported - IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4 - IDPRED_CTRL // IPRED_DIS - INT_WBINVD // WBINVD/WBNOINVD are interruptible. - INVLPGB // NVLPGB and TLBSYNC instruction supported - LAHF // LAHF/SAHF in long mode - LAM // If set, CPU supports Linear Address Masking - LBRVIRT // LBR virtualization - LZCNT // LZCNT instruction - MCAOVERFLOW // MCA overflow recovery support. - MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it. - MCOMMIT // MCOMMIT instruction supported - MD_CLEAR // VERW clears CPU buffers - MMX // standard MMX - MMXEXT // SSE integer functions or AMD MMX ext - MOVBE // MOVBE instruction (big-endian) - MOVDIR64B // Move 64 Bytes as Direct Store - MOVDIRI // Move Doubleword as Direct Store - MOVSB_ZL // Fast Zero-Length MOVSB - MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD - MPX // Intel MPX (Memory Protection Extensions) - MSRIRC // Instruction Retired Counter MSR available - MSRLIST // Read/Write List of Model Specific Registers - MSR_PAGEFLUSH // Page Flush MSR available - NRIPS // Indicates support for NRIP save on VMEXIT - NX // NX (No-Execute) bit - OSXSAVE // XSAVE enabled by OS - PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption - POPCNT // POPCNT instruction - PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled - PREFETCHI // PREFETCHIT0/1 instructions - PSFD // Predictive Store Forward Disable - RDPRU // RDPRU instruction supported - RDRAND // RDRAND instruction is available - RDSEED // RDSEED instruction is available - RDTSCP // RDTSCP Instruction - RRSBA_CTRL // Restricted RSB Alternate - RTM // Restricted Transactional Memory - RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. 
- SERIALIZE // Serialize Instruction Execution - SEV // AMD Secure Encrypted Virtualization supported - SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host - SEV_ALTERNATIVE // AMD SEV Alternate Injection supported - SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests - SEV_ES // AMD SEV Encrypted State supported - SEV_RESTRICTED // AMD SEV Restricted Injection supported - SEV_SNP // AMD SEV Secure Nested Paging supported - SGX // Software Guard Extensions - SGXLC // Software Guard Extensions Launch Control - SHA // Intel SHA Extensions - SME // AMD Secure Memory Encryption supported - SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced - SPEC_CTRL_SSBD // Speculative Store Bypass Disable - SRBDS_CTRL // SRBDS mitigation MSR available - SSE // SSE functions - SSE2 // P4 SSE functions - SSE3 // Prescott SSE3 functions - SSE4 // Penryn SSE4.1 functions - SSE42 // Nehalem SSE4.2 functions - SSE4A // AMD Barcelona microarchitecture SSE4a instructions - SSSE3 // Conroe SSSE3 functions - STIBP // Single Thread Indirect Branch Predictors - STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On - STOSB_SHORT // Fast short STOSB - SUCCOR // Software uncorrectable error containment and recovery capability. - SVM // AMD Secure Virtual Machine - SVMDA // Indicates support for the SVM decode assists. - SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control - SVML // AMD SVM lock. Indicates support for SVM-Lock. - SVMNP // AMD SVM nested paging - SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter - SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold - SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions. - SYSEE // SYSENTER and SYSEXIT instructions - TBM // AMD Trailing Bit Manipulation - TDX_GUEST // Intel Trust Domain Extensions Guest - TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations - TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. - TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. - TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 - TSXLDTRK // Intel TSX Suspend Load Address Tracking - VAES // Vector AES. AVX(512) versions requires additional checks. - VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits. - VMPL // AMD VM Permission Levels supported - VMSA_REGPROT // AMD VMSA Register Protection supported - VMX // Virtual Machine Extensions - VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. - VTE // AMD Virtual Transparent Encryption supported - WAITPKG // TPAUSE, UMONITOR, UMWAIT - WBNOINVD // Write Back and Do Not Invalidate Cache - WRMSRNS // Non-Serializing Write to Model Specific Register - X87 // FPU - XGETBV1 // Supports XGETBV with ECX = 1 - XOP // Bulldozer XOP functions - XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV - XSAVEC // Supports XSAVEC and the compacted form of XRSTOR. 
- XSAVEOPT // XSAVEOPT available - XSAVES // Supports XSAVES/XRSTORS and IA32_XSS + // x86 features + ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions) + AESNI // Advanced Encryption Standard New Instructions + AMD3DNOW // AMD 3DNOW + AMD3DNOWEXT // AMD 3DNowExt + AMXBF16 // Tile computational operations on BFLOAT16 numbers + AMXFP16 // Tile computational operations on FP16 numbers + AMXINT8 // Tile computational operations on 8-bit integers + AMXTILE // Tile architecture + APX_F // Intel APX + AVX // AVX functions + AVX10 // If set the Intel AVX10 Converged Vector ISA is supported + AVX10_128 // If set indicates that AVX10 128-bit vector support is present + AVX10_256 // If set indicates that AVX10 256-bit vector support is present + AVX10_512 // If set indicates that AVX10 512-bit vector support is present + AVX2 // AVX2 functions + AVX512BF16 // AVX-512 BFLOAT16 Instructions + AVX512BITALG // AVX-512 Bit Algorithms + AVX512BW // AVX-512 Byte and Word Instructions + AVX512CD // AVX-512 Conflict Detection Instructions + AVX512DQ // AVX-512 Doubleword and Quadword Instructions + AVX512ER // AVX-512 Exponential and Reciprocal Instructions + AVX512F // AVX-512 Foundation + AVX512FP16 // AVX-512 FP16 Instructions + AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions + AVX512PF // AVX-512 Prefetch Instructions + AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions + AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2 + AVX512VL // AVX-512 Vector Length Extensions + AVX512VNNI // AVX-512 Vector Neural Network Instructions + AVX512VP2INTERSECT // AVX-512 Intersect for D/Q + AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword + AVXIFMA // AVX-IFMA instructions + AVXNECONVERT // AVX-NE-CONVERT instructions + AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one + AVXVNNI // AVX (VEX encoded) VNNI neural network instructions + AVXVNNIINT8 // AVX-VNNI-INT8 instructions + BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 + BMI1 // Bit Manipulation Instruction Set 1 + BMI2 // Bit Manipulation Instruction Set 2 + CETIBT // Intel CET Indirect Branch Tracking + CETSS // Intel CET Shadow Stack + CLDEMOTE // Cache Line Demote + CLMUL // Carry-less Multiplication + CLZERO // CLZERO instruction supported + CMOV // i686 CMOV + CMPCCXADD // CMPCCXADD instructions + CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB + CMPXCHG8 // CMPXCHG8 instruction + CPBOOST // Core Performance Boost + CPPC // AMD: Collaborative Processor Performance Control + CX16 // CMPXCHG16B Instruction + EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ + ENQCMD // Enqueue Command + ERMS // Enhanced REP MOVSB/STOSB + F16C // Half-precision floating-point conversion + FLUSH_L1D // Flush L1D cache + FMA3 // Intel FMA 3. Does not imply AVX. + FMA4 // Bulldozer FMA4 functions + FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide + FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide + FSRM // Fast Short Rep Mov + FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9 + FXSROPT // FXSAVE/FXRSTOR optimizations + GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. 
+ HLE // Hardware Lock Elision + HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR + HTT // Hyperthreading (enabled) + HWA // Hardware assert supported. Indicates support for MSRC001_10 + HYBRID_CPU // This part has CPUs of more than one type. + HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors + IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel) + IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR + IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) + IBPB_BRTYPE // Indicates that MSR 49h (PRED_CMD) bit 0 (IBPB) flushes all branch type predictions from the CPU branch predictor + IBRS // AMD: Indirect Branch Restricted Speculation + IBRS_PREFERRED // AMD: IBRS is preferred over software solution + IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection + IBS // Instruction Based Sampling (AMD) + IBSBRNTRGT // Instruction Based Sampling Feature (AMD) + IBSFETCHSAM // Instruction Based Sampling Feature (AMD) + IBSFFV // Instruction Based Sampling Feature (AMD) + IBSOPCNT // Instruction Based Sampling Feature (AMD) + IBSOPCNTEXT // Instruction Based Sampling Feature (AMD) + IBSOPSAM // Instruction Based Sampling Feature (AMD) + IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) + IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) + IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported + IBS_OPDATA4 // AMD: IBS op data 4 MSR supported + IBS_OPFUSE // AMD: Indicates support for IbsOpFuse + IBS_PREVENTHOST // Disallowing IBS use by the host supported + IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4 + IDPRED_CTRL // IPRED_DIS + INT_WBINVD // WBINVD/WBNOINVD are interruptible. + INVLPGB // NVLPGB and TLBSYNC instruction supported + KEYLOCKER // Key locker + KEYLOCKERW // Key locker wide + LAHF // LAHF/SAHF in long mode + LAM // If set, CPU supports Linear Address Masking + LBRVIRT // LBR virtualization + LZCNT // LZCNT instruction + MCAOVERFLOW // MCA overflow recovery support. + MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it. + MCOMMIT // MCOMMIT instruction supported + MD_CLEAR // VERW clears CPU buffers + MMX // standard MMX + MMXEXT // SSE integer functions or AMD MMX ext + MOVBE // MOVBE instruction (big-endian) + MOVDIR64B // Move 64 Bytes as Direct Store + MOVDIRI // Move Doubleword as Direct Store + MOVSB_ZL // Fast Zero-Length MOVSB + MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD + MPX // Intel MPX (Memory Protection Extensions) + MSRIRC // Instruction Retired Counter MSR available + MSRLIST // Read/Write List of Model Specific Registers + MSR_PAGEFLUSH // Page Flush MSR available + NRIPS // Indicates support for NRIP save on VMEXIT + NX // NX (No-Execute) bit + OSXSAVE // XSAVE enabled by OS + PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption + POPCNT // POPCNT instruction + PPIN // AMD: Protected Processor Inventory Number support. 
Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled + PREFETCHI // PREFETCHIT0/1 instructions + PSFD // Predictive Store Forward Disable + RDPRU // RDPRU instruction supported + RDRAND // RDRAND instruction is available + RDSEED // RDSEED instruction is available + RDTSCP // RDTSCP Instruction + RRSBA_CTRL // Restricted RSB Alternate + RTM // Restricted Transactional Memory + RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. + SBPB // Indicates support for the Selective Branch Predictor Barrier + SERIALIZE // Serialize Instruction Execution + SEV // AMD Secure Encrypted Virtualization supported + SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host + SEV_ALTERNATIVE // AMD SEV Alternate Injection supported + SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests + SEV_ES // AMD SEV Encrypted State supported + SEV_RESTRICTED // AMD SEV Restricted Injection supported + SEV_SNP // AMD SEV Secure Nested Paging supported + SGX // Software Guard Extensions + SGXLC // Software Guard Extensions Launch Control + SHA // Intel SHA Extensions + SME // AMD Secure Memory Encryption supported + SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced + SPEC_CTRL_SSBD // Speculative Store Bypass Disable + SRBDS_CTRL // SRBDS mitigation MSR available + SRSO_MSR_FIX // Indicates that software may use MSR BP_CFG[BpSpecReduce] to mitigate SRSO. + SRSO_NO // Indicates the CPU is not subject to the SRSO vulnerability + SRSO_USER_KERNEL_NO // Indicates the CPU is not subject to the SRSO vulnerability across user/kernel boundaries + SSE // SSE functions + SSE2 // P4 SSE functions + SSE3 // Prescott SSE3 functions + SSE4 // Penryn SSE4.1 functions + SSE42 // Nehalem SSE4.2 functions + SSE4A // AMD Barcelona microarchitecture SSE4a instructions + SSSE3 // Conroe SSSE3 functions + STIBP // Single Thread Indirect Branch Predictors + STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On + STOSB_SHORT // Fast short STOSB + SUCCOR // Software uncorrectable error containment and recovery capability. + SVM // AMD Secure Virtual Machine + SVMDA // Indicates support for the SVM decode assists. + SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control + SVML // AMD SVM lock. Indicates support for SVM-Lock. + SVMNP // AMD SVM nested paging + SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter + SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold + SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions. + SYSEE // SYSENTER and SYSEXIT instructions + TBM // AMD Trailing Bit Manipulation + TDX_GUEST // Intel Trust Domain Extensions Guest + TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations + TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. + TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. + TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 + TSXLDTRK // Intel TSX Suspend Load Address Tracking + VAES // Vector AES. 
AVX(512) versions requires additional checks. + VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits. + VMPL // AMD VM Permission Levels supported + VMSA_REGPROT // AMD VMSA Register Protection supported + VMX // Virtual Machine Extensions + VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. + VTE // AMD Virtual Transparent Encryption supported + WAITPKG // TPAUSE, UMONITOR, UMWAIT + WBNOINVD // Write Back and Do Not Invalidate Cache + WRMSRNS // Non-Serializing Write to Model Specific Register + X87 // FPU + XGETBV1 // Supports XGETBV with ECX = 1 + XOP // Bulldozer XOP functions + XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV + XSAVEC // Supports XSAVEC and the compacted form of XRSTOR. + XSAVEOPT // XSAVEOPT available + XSAVES // Supports XSAVES/XRSTORS and IA32_XSS // ARM features: AESARM // AES instructions @@ -302,9 +314,11 @@ type CPUInfo struct { L2 int // L2 Cache (per core or shared). Will be -1 if undetected L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected } - SGX SGXSupport - maxFunc uint32 - maxExFunc uint32 + SGX SGXSupport + AMDMemEncryption AMDMemEncryptionSupport + AVX10Level uint8 + maxFunc uint32 + maxExFunc uint32 } var cpuid func(op uint32) (eax, ebx, ecx, edx uint32) @@ -1071,6 +1085,32 @@ func hasSGX(available, lc bool) (rval SGXSupport) { return } +type AMDMemEncryptionSupport struct { + Available bool + CBitPossition uint32 + NumVMPL uint32 + PhysAddrReduction uint32 + NumEntryptedGuests uint32 + MinSevNoEsAsid uint32 +} + +func hasAMDMemEncryption(available bool) (rval AMDMemEncryptionSupport) { + rval.Available = available + if !available { + return + } + + _, b, c, d := cpuidex(0x8000001f, 0) + + rval.CBitPossition = b & 0x3f + rval.PhysAddrReduction = (b >> 6) & 0x3F + rval.NumVMPL = (b >> 12) & 0xf + rval.NumEntryptedGuests = c + rval.MinSevNoEsAsid = d + + return +} + func support() flagSet { var fs flagSet mfi := maxFunctionID() @@ -1165,6 +1205,7 @@ func support() flagSet { fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) fs.setIf(ecx&(1<<13) != 0, TME) fs.setIf(ecx&(1<<25) != 0, CLDEMOTE) + fs.setIf(ecx&(1<<23) != 0, KEYLOCKER) fs.setIf(ecx&(1<<27) != 0, MOVDIRI) fs.setIf(ecx&(1<<28) != 0, MOVDIR64B) fs.setIf(ecx&(1<<29) != 0, ENQCMD) @@ -1202,6 +1243,8 @@ func support() flagSet { fs.setIf(edx1&(1<<4) != 0, AVXVNNIINT8) fs.setIf(edx1&(1<<5) != 0, AVXNECONVERT) fs.setIf(edx1&(1<<14) != 0, PREFETCHI) + fs.setIf(edx1&(1<<19) != 0, AVX10) + fs.setIf(edx1&(1<<21) != 0, APX_F) // Only detect AVX-512 features if XGETBV is supported if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { @@ -1252,6 +1295,19 @@ func support() flagSet { fs.setIf(edx&(1<<4) != 0, BHI_CTRL) fs.setIf(edx&(1<<5) != 0, MCDT_NO) + // Add keylocker features. + if fs.inSet(KEYLOCKER) && mfi >= 0x19 { + _, ebx, _, _ := cpuidex(0x19, 0) + fs.setIf(ebx&5 == 5, KEYLOCKERW) // Bit 0 and 2 (1+4) + } + + // Add AVX10 features. 
+ if fs.inSet(AVX10) && mfi >= 0x24 { + _, ebx, _, _ := cpuidex(0x24, 0) + fs.setIf(ebx&(1<<16) != 0, AVX10_128) + fs.setIf(ebx&(1<<17) != 0, AVX10_256) + fs.setIf(ebx&(1<<18) != 0, AVX10_512) + } } // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1) @@ -1394,6 +1450,29 @@ func support() flagSet { fs.setIf((a>>24)&1 == 1, VMSA_REGPROT) } + if maxExtendedFunction() >= 0x80000021 && vend == AMD { + a, _, _, _ := cpuid(0x80000021) + fs.setIf((a>>31)&1 == 1, SRSO_MSR_FIX) + fs.setIf((a>>30)&1 == 1, SRSO_USER_KERNEL_NO) + fs.setIf((a>>29)&1 == 1, SRSO_NO) + fs.setIf((a>>28)&1 == 1, IBPB_BRTYPE) + fs.setIf((a>>27)&1 == 1, SBPB) + } + + if mfi >= 0x20 { + // Microsoft has decided to purposefully hide the information + // of the guest TEE when VMs are being created using Hyper-V. + // + // This leads us to check for the Hyper-V cpuid features + // (0x4000000C), and then for the `ebx` value set. + // + // For Intel TDX, `ebx` is set as `0xbe3`, being 3 the part + // we're mostly interested about,according to: + // https://github.com/torvalds/linux/blob/d2f51b3516dade79269ff45eae2a7668ae711b25/arch/x86/include/asm/hyperv-tlfs.h#L169-L174 + _, ebx, _, _ := cpuid(0x4000000C) + fs.setIf(ebx == 0xbe3, TDX_GUEST) + } + if mfi >= 0x21 { // Intel Trusted Domain Extensions Guests have their own cpuid leaf (0x21). _, ebx, ecx, edx := cpuid(0x21) @@ -1404,6 +1483,14 @@ func support() flagSet { return fs } +func (c *CPUInfo) supportAVX10() uint8 { + if c.maxFunc >= 0x24 && c.featureSet.inSet(AVX10) { + _, ebx, _, _ := cpuidex(0x24, 0) + return uint8(ebx) + } + return 0 +} + func valAsString(values ...uint32) []byte { r := make([]byte, 4*len(values)) for i, v := range values { diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go index c946824ec..799b400c2 100644 --- a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go +++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go @@ -27,10 +27,12 @@ func addInfo(c *CPUInfo, safe bool) { c.Family, c.Model, c.Stepping = familyModel() c.featureSet = support() c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC)) + c.AMDMemEncryption = hasAMDMemEncryption(c.featureSet.inSet(SME) || c.featureSet.inSet(SEV)) c.ThreadsPerCore = threadsPerCore() c.LogicalCores = logicalCores() c.PhysicalCores = physicalCores() c.VendorID, c.VendorString = vendorID() + c.AVX10Level = c.supportAVX10() c.cacheSize() c.frequencies() } diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go index 024c706af..57a085a53 100644 --- a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go +++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go @@ -16,210 +16,222 @@ func _() { _ = x[AMXFP16-6] _ = x[AMXINT8-7] _ = x[AMXTILE-8] - _ = x[AVX-9] - _ = x[AVX2-10] - _ = x[AVX512BF16-11] - _ = x[AVX512BITALG-12] - _ = x[AVX512BW-13] - _ = x[AVX512CD-14] - _ = x[AVX512DQ-15] - _ = x[AVX512ER-16] - _ = x[AVX512F-17] - _ = x[AVX512FP16-18] - _ = x[AVX512IFMA-19] - _ = x[AVX512PF-20] - _ = x[AVX512VBMI-21] - _ = x[AVX512VBMI2-22] - _ = x[AVX512VL-23] - _ = x[AVX512VNNI-24] - _ = x[AVX512VP2INTERSECT-25] - _ = x[AVX512VPOPCNTDQ-26] - _ = x[AVXIFMA-27] - _ = x[AVXNECONVERT-28] - _ = x[AVXSLOW-29] - _ = x[AVXVNNI-30] - _ = x[AVXVNNIINT8-31] - _ = x[BHI_CTRL-32] - _ = x[BMI1-33] - _ = x[BMI2-34] - _ = x[CETIBT-35] - _ = x[CETSS-36] - _ = x[CLDEMOTE-37] - _ = x[CLMUL-38] - _ = x[CLZERO-39] - _ = x[CMOV-40] - _ = x[CMPCCXADD-41] - _ = 
x[CMPSB_SCADBS_SHORT-42] - _ = x[CMPXCHG8-43] - _ = x[CPBOOST-44] - _ = x[CPPC-45] - _ = x[CX16-46] - _ = x[EFER_LMSLE_UNS-47] - _ = x[ENQCMD-48] - _ = x[ERMS-49] - _ = x[F16C-50] - _ = x[FLUSH_L1D-51] - _ = x[FMA3-52] - _ = x[FMA4-53] - _ = x[FP128-54] - _ = x[FP256-55] - _ = x[FSRM-56] - _ = x[FXSR-57] - _ = x[FXSROPT-58] - _ = x[GFNI-59] - _ = x[HLE-60] - _ = x[HRESET-61] - _ = x[HTT-62] - _ = x[HWA-63] - _ = x[HYBRID_CPU-64] - _ = x[HYPERVISOR-65] - _ = x[IA32_ARCH_CAP-66] - _ = x[IA32_CORE_CAP-67] - _ = x[IBPB-68] - _ = x[IBRS-69] - _ = x[IBRS_PREFERRED-70] - _ = x[IBRS_PROVIDES_SMP-71] - _ = x[IBS-72] - _ = x[IBSBRNTRGT-73] - _ = x[IBSFETCHSAM-74] - _ = x[IBSFFV-75] - _ = x[IBSOPCNT-76] - _ = x[IBSOPCNTEXT-77] - _ = x[IBSOPSAM-78] - _ = x[IBSRDWROPCNT-79] - _ = x[IBSRIPINVALIDCHK-80] - _ = x[IBS_FETCH_CTLX-81] - _ = x[IBS_OPDATA4-82] - _ = x[IBS_OPFUSE-83] - _ = x[IBS_PREVENTHOST-84] - _ = x[IBS_ZEN4-85] - _ = x[IDPRED_CTRL-86] - _ = x[INT_WBINVD-87] - _ = x[INVLPGB-88] - _ = x[LAHF-89] - _ = x[LAM-90] - _ = x[LBRVIRT-91] - _ = x[LZCNT-92] - _ = x[MCAOVERFLOW-93] - _ = x[MCDT_NO-94] - _ = x[MCOMMIT-95] - _ = x[MD_CLEAR-96] - _ = x[MMX-97] - _ = x[MMXEXT-98] - _ = x[MOVBE-99] - _ = x[MOVDIR64B-100] - _ = x[MOVDIRI-101] - _ = x[MOVSB_ZL-102] - _ = x[MOVU-103] - _ = x[MPX-104] - _ = x[MSRIRC-105] - _ = x[MSRLIST-106] - _ = x[MSR_PAGEFLUSH-107] - _ = x[NRIPS-108] - _ = x[NX-109] - _ = x[OSXSAVE-110] - _ = x[PCONFIG-111] - _ = x[POPCNT-112] - _ = x[PPIN-113] - _ = x[PREFETCHI-114] - _ = x[PSFD-115] - _ = x[RDPRU-116] - _ = x[RDRAND-117] - _ = x[RDSEED-118] - _ = x[RDTSCP-119] - _ = x[RRSBA_CTRL-120] - _ = x[RTM-121] - _ = x[RTM_ALWAYS_ABORT-122] - _ = x[SERIALIZE-123] - _ = x[SEV-124] - _ = x[SEV_64BIT-125] - _ = x[SEV_ALTERNATIVE-126] - _ = x[SEV_DEBUGSWAP-127] - _ = x[SEV_ES-128] - _ = x[SEV_RESTRICTED-129] - _ = x[SEV_SNP-130] - _ = x[SGX-131] - _ = x[SGXLC-132] - _ = x[SHA-133] - _ = x[SME-134] - _ = x[SME_COHERENT-135] - _ = x[SPEC_CTRL_SSBD-136] - _ = x[SRBDS_CTRL-137] - _ = x[SSE-138] - _ = x[SSE2-139] - _ = x[SSE3-140] - _ = x[SSE4-141] - _ = x[SSE42-142] - _ = x[SSE4A-143] - _ = x[SSSE3-144] - _ = x[STIBP-145] - _ = x[STIBP_ALWAYSON-146] - _ = x[STOSB_SHORT-147] - _ = x[SUCCOR-148] - _ = x[SVM-149] - _ = x[SVMDA-150] - _ = x[SVMFBASID-151] - _ = x[SVML-152] - _ = x[SVMNP-153] - _ = x[SVMPF-154] - _ = x[SVMPFT-155] - _ = x[SYSCALL-156] - _ = x[SYSEE-157] - _ = x[TBM-158] - _ = x[TDX_GUEST-159] - _ = x[TLB_FLUSH_NESTED-160] - _ = x[TME-161] - _ = x[TOPEXT-162] - _ = x[TSCRATEMSR-163] - _ = x[TSXLDTRK-164] - _ = x[VAES-165] - _ = x[VMCBCLEAN-166] - _ = x[VMPL-167] - _ = x[VMSA_REGPROT-168] - _ = x[VMX-169] - _ = x[VPCLMULQDQ-170] - _ = x[VTE-171] - _ = x[WAITPKG-172] - _ = x[WBNOINVD-173] - _ = x[WRMSRNS-174] - _ = x[X87-175] - _ = x[XGETBV1-176] - _ = x[XOP-177] - _ = x[XSAVE-178] - _ = x[XSAVEC-179] - _ = x[XSAVEOPT-180] - _ = x[XSAVES-181] - _ = x[AESARM-182] - _ = x[ARMCPUID-183] - _ = x[ASIMD-184] - _ = x[ASIMDDP-185] - _ = x[ASIMDHP-186] - _ = x[ASIMDRDM-187] - _ = x[ATOMICS-188] - _ = x[CRC32-189] - _ = x[DCPOP-190] - _ = x[EVTSTRM-191] - _ = x[FCMA-192] - _ = x[FP-193] - _ = x[FPHP-194] - _ = x[GPA-195] - _ = x[JSCVT-196] - _ = x[LRCPC-197] - _ = x[PMULL-198] - _ = x[SHA1-199] - _ = x[SHA2-200] - _ = x[SHA3-201] - _ = x[SHA512-202] - _ = x[SM3-203] - _ = x[SM4-204] - _ = x[SVE-205] - _ = x[lastID-206] + _ = x[APX_F-9] + _ = x[AVX-10] + _ = x[AVX10-11] + _ = x[AVX10_128-12] + _ = x[AVX10_256-13] + _ = x[AVX10_512-14] + _ = x[AVX2-15] + _ = x[AVX512BF16-16] + _ = 
x[AVX512BITALG-17] + _ = x[AVX512BW-18] + _ = x[AVX512CD-19] + _ = x[AVX512DQ-20] + _ = x[AVX512ER-21] + _ = x[AVX512F-22] + _ = x[AVX512FP16-23] + _ = x[AVX512IFMA-24] + _ = x[AVX512PF-25] + _ = x[AVX512VBMI-26] + _ = x[AVX512VBMI2-27] + _ = x[AVX512VL-28] + _ = x[AVX512VNNI-29] + _ = x[AVX512VP2INTERSECT-30] + _ = x[AVX512VPOPCNTDQ-31] + _ = x[AVXIFMA-32] + _ = x[AVXNECONVERT-33] + _ = x[AVXSLOW-34] + _ = x[AVXVNNI-35] + _ = x[AVXVNNIINT8-36] + _ = x[BHI_CTRL-37] + _ = x[BMI1-38] + _ = x[BMI2-39] + _ = x[CETIBT-40] + _ = x[CETSS-41] + _ = x[CLDEMOTE-42] + _ = x[CLMUL-43] + _ = x[CLZERO-44] + _ = x[CMOV-45] + _ = x[CMPCCXADD-46] + _ = x[CMPSB_SCADBS_SHORT-47] + _ = x[CMPXCHG8-48] + _ = x[CPBOOST-49] + _ = x[CPPC-50] + _ = x[CX16-51] + _ = x[EFER_LMSLE_UNS-52] + _ = x[ENQCMD-53] + _ = x[ERMS-54] + _ = x[F16C-55] + _ = x[FLUSH_L1D-56] + _ = x[FMA3-57] + _ = x[FMA4-58] + _ = x[FP128-59] + _ = x[FP256-60] + _ = x[FSRM-61] + _ = x[FXSR-62] + _ = x[FXSROPT-63] + _ = x[GFNI-64] + _ = x[HLE-65] + _ = x[HRESET-66] + _ = x[HTT-67] + _ = x[HWA-68] + _ = x[HYBRID_CPU-69] + _ = x[HYPERVISOR-70] + _ = x[IA32_ARCH_CAP-71] + _ = x[IA32_CORE_CAP-72] + _ = x[IBPB-73] + _ = x[IBPB_BRTYPE-74] + _ = x[IBRS-75] + _ = x[IBRS_PREFERRED-76] + _ = x[IBRS_PROVIDES_SMP-77] + _ = x[IBS-78] + _ = x[IBSBRNTRGT-79] + _ = x[IBSFETCHSAM-80] + _ = x[IBSFFV-81] + _ = x[IBSOPCNT-82] + _ = x[IBSOPCNTEXT-83] + _ = x[IBSOPSAM-84] + _ = x[IBSRDWROPCNT-85] + _ = x[IBSRIPINVALIDCHK-86] + _ = x[IBS_FETCH_CTLX-87] + _ = x[IBS_OPDATA4-88] + _ = x[IBS_OPFUSE-89] + _ = x[IBS_PREVENTHOST-90] + _ = x[IBS_ZEN4-91] + _ = x[IDPRED_CTRL-92] + _ = x[INT_WBINVD-93] + _ = x[INVLPGB-94] + _ = x[KEYLOCKER-95] + _ = x[KEYLOCKERW-96] + _ = x[LAHF-97] + _ = x[LAM-98] + _ = x[LBRVIRT-99] + _ = x[LZCNT-100] + _ = x[MCAOVERFLOW-101] + _ = x[MCDT_NO-102] + _ = x[MCOMMIT-103] + _ = x[MD_CLEAR-104] + _ = x[MMX-105] + _ = x[MMXEXT-106] + _ = x[MOVBE-107] + _ = x[MOVDIR64B-108] + _ = x[MOVDIRI-109] + _ = x[MOVSB_ZL-110] + _ = x[MOVU-111] + _ = x[MPX-112] + _ = x[MSRIRC-113] + _ = x[MSRLIST-114] + _ = x[MSR_PAGEFLUSH-115] + _ = x[NRIPS-116] + _ = x[NX-117] + _ = x[OSXSAVE-118] + _ = x[PCONFIG-119] + _ = x[POPCNT-120] + _ = x[PPIN-121] + _ = x[PREFETCHI-122] + _ = x[PSFD-123] + _ = x[RDPRU-124] + _ = x[RDRAND-125] + _ = x[RDSEED-126] + _ = x[RDTSCP-127] + _ = x[RRSBA_CTRL-128] + _ = x[RTM-129] + _ = x[RTM_ALWAYS_ABORT-130] + _ = x[SBPB-131] + _ = x[SERIALIZE-132] + _ = x[SEV-133] + _ = x[SEV_64BIT-134] + _ = x[SEV_ALTERNATIVE-135] + _ = x[SEV_DEBUGSWAP-136] + _ = x[SEV_ES-137] + _ = x[SEV_RESTRICTED-138] + _ = x[SEV_SNP-139] + _ = x[SGX-140] + _ = x[SGXLC-141] + _ = x[SHA-142] + _ = x[SME-143] + _ = x[SME_COHERENT-144] + _ = x[SPEC_CTRL_SSBD-145] + _ = x[SRBDS_CTRL-146] + _ = x[SRSO_MSR_FIX-147] + _ = x[SRSO_NO-148] + _ = x[SRSO_USER_KERNEL_NO-149] + _ = x[SSE-150] + _ = x[SSE2-151] + _ = x[SSE3-152] + _ = x[SSE4-153] + _ = x[SSE42-154] + _ = x[SSE4A-155] + _ = x[SSSE3-156] + _ = x[STIBP-157] + _ = x[STIBP_ALWAYSON-158] + _ = x[STOSB_SHORT-159] + _ = x[SUCCOR-160] + _ = x[SVM-161] + _ = x[SVMDA-162] + _ = x[SVMFBASID-163] + _ = x[SVML-164] + _ = x[SVMNP-165] + _ = x[SVMPF-166] + _ = x[SVMPFT-167] + _ = x[SYSCALL-168] + _ = x[SYSEE-169] + _ = x[TBM-170] + _ = x[TDX_GUEST-171] + _ = x[TLB_FLUSH_NESTED-172] + _ = x[TME-173] + _ = x[TOPEXT-174] + _ = x[TSCRATEMSR-175] + _ = x[TSXLDTRK-176] + _ = x[VAES-177] + _ = x[VMCBCLEAN-178] + _ = x[VMPL-179] + _ = x[VMSA_REGPROT-180] + _ = x[VMX-181] + _ = x[VPCLMULQDQ-182] + _ = x[VTE-183] + _ = x[WAITPKG-184] + _ = 
x[WBNOINVD-185] + _ = x[WRMSRNS-186] + _ = x[X87-187] + _ = x[XGETBV1-188] + _ = x[XOP-189] + _ = x[XSAVE-190] + _ = x[XSAVEC-191] + _ = x[XSAVEOPT-192] + _ = x[XSAVES-193] + _ = x[AESARM-194] + _ = x[ARMCPUID-195] + _ = x[ASIMD-196] + _ = x[ASIMDDP-197] + _ = x[ASIMDHP-198] + _ = x[ASIMDRDM-199] + _ = x[ATOMICS-200] + _ = x[CRC32-201] + _ = x[DCPOP-202] + _ = x[EVTSTRM-203] + _ = x[FCMA-204] + _ = x[FP-205] + _ = x[FPHP-206] + _ = x[GPA-207] + _ = x[JSCVT-208] + _ = x[LRCPC-209] + _ = x[PMULL-210] + _ = x[SHA1-211] + _ = x[SHA2-212] + _ = x[SHA3-213] + _ = x[SHA512-214] + _ = x[SM3-215] + _ = x[SM4-216] + _ = x[SVE-217] + _ = x[lastID-218] _ = x[firstID-0] } -const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" +const _FeatureID_name = 
"firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAPX_FAVXAVX10AVX10_128AVX10_256AVX10_512AVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBPB_BRTYPEIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBKEYLOCKERKEYLOCKERWLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSBPBSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSRSO_MSR_FIXSRSO_NOSRSO_USER_KERNEL_NOSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" -var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 65, 69, 79, 91, 99, 107, 115, 123, 130, 140, 150, 158, 168, 179, 187, 197, 215, 230, 237, 249, 256, 263, 274, 282, 286, 290, 296, 301, 309, 314, 320, 324, 333, 351, 359, 366, 370, 374, 388, 394, 398, 402, 411, 415, 419, 424, 429, 433, 437, 444, 448, 451, 457, 460, 463, 473, 483, 496, 509, 513, 517, 531, 548, 551, 561, 572, 578, 586, 597, 605, 617, 633, 647, 658, 668, 683, 691, 702, 712, 719, 723, 726, 733, 738, 749, 756, 763, 771, 774, 780, 785, 794, 801, 809, 813, 816, 822, 829, 842, 847, 849, 856, 863, 869, 873, 882, 886, 891, 897, 903, 909, 919, 922, 938, 947, 950, 959, 974, 987, 993, 1007, 1014, 1017, 1022, 1025, 1028, 1040, 1054, 1064, 1067, 1071, 1075, 1079, 1084, 1089, 1094, 1099, 1113, 1124, 1130, 1133, 1138, 1147, 1151, 1156, 1161, 1167, 1174, 1179, 1182, 1191, 1207, 1210, 1216, 1226, 1234, 1238, 1247, 1251, 1263, 1266, 1276, 1279, 1286, 1294, 1301, 1304, 1311, 1314, 1319, 1325, 1333, 1339, 1345, 1353, 1358, 1365, 1372, 1380, 1387, 1392, 1397, 1404, 1408, 1410, 1414, 1417, 1422, 1427, 1432, 1436, 1440, 1444, 1450, 1453, 1456, 1459, 1465} +var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 67, 70, 75, 84, 93, 102, 106, 116, 128, 136, 144, 152, 160, 167, 177, 187, 195, 205, 216, 224, 234, 252, 267, 274, 286, 293, 300, 311, 319, 323, 327, 333, 338, 346, 351, 357, 361, 370, 388, 396, 403, 407, 411, 425, 431, 435, 439, 448, 452, 456, 461, 466, 470, 474, 481, 485, 488, 494, 497, 500, 510, 520, 533, 546, 550, 561, 565, 579, 596, 599, 609, 620, 626, 634, 645, 653, 665, 681, 695, 706, 716, 731, 739, 750, 760, 767, 776, 786, 790, 793, 800, 805, 816, 823, 830, 838, 841, 847, 852, 861, 868, 876, 880, 883, 889, 896, 909, 914, 916, 923, 930, 936, 940, 949, 953, 958, 964, 970, 976, 986, 989, 1005, 1009, 1018, 1021, 1030, 1045, 1058, 1064, 1078, 1085, 1088, 1093, 1096, 1099, 1111, 1125, 1135, 1147, 1154, 1173, 1176, 1180, 1184, 1188, 1193, 1198, 1203, 
1208, 1222, 1233, 1239, 1242, 1247, 1256, 1260, 1265, 1270, 1276, 1283, 1288, 1291, 1300, 1316, 1319, 1325, 1335, 1343, 1347, 1356, 1360, 1372, 1375, 1385, 1388, 1395, 1403, 1410, 1413, 1420, 1423, 1428, 1434, 1442, 1448, 1454, 1462, 1467, 1474, 1481, 1489, 1496, 1501, 1506, 1513, 1517, 1519, 1523, 1526, 1531, 1536, 1541, 1545, 1549, 1553, 1559, 1562, 1565, 1568, 1574} func (i FeatureID) String() string { if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) { diff --git a/vendor/github.com/klauspost/reedsolomon/.gitignore b/vendor/github.com/klauspost/reedsolomon/.gitignore new file mode 100644 index 000000000..59610b561 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/.gitignore @@ -0,0 +1,26 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof + +.idea \ No newline at end of file diff --git a/vendor/github.com/klauspost/reedsolomon/LICENSE b/vendor/github.com/klauspost/reedsolomon/LICENSE new file mode 100644 index 000000000..a947e162b --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/LICENSE @@ -0,0 +1,23 @@ +The MIT License (MIT) + +Copyright (c) 2015 Klaus Post +Copyright (c) 2015 Backblaze + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md new file mode 100644 index 000000000..bdcb9e787 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/README.md @@ -0,0 +1,566 @@ +# Reed-Solomon +[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/reedsolomon.svg)](https://pkg.go.dev/github.com/klauspost/reedsolomon) [![Go](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml) + +Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go. + +This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by +[Backblaze](http://backblaze.com), with some additional optimizations. + +For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/). + +For encoding high shard counts (>256) a Leopard implementation is used. +For most platforms this performs close to the original Leopard implementation in terms of speed. 
+ +Package home: https://github.com/klauspost/reedsolomon + +Godoc: https://pkg.go.dev/github.com/klauspost/reedsolomon + +# Installation +To get the package use the standard: +```bash +go get -u github.com/klauspost/reedsolomon +``` + +Using Go modules is recommended. + +# Changes + +## 2022 + +* [GFNI](https://github.com/klauspost/reedsolomon/pull/224) support for amd64, for up to 3x faster processing. +* [Leopard GF8](https://github.com/klauspost/reedsolomon#leopard-gf8) mode added, for faster processing of medium shard counts. +* [Leopard GF16](https://github.com/klauspost/reedsolomon#leopard-compatible-gf16) mode added, for up to 65536 shards. +* [WithJerasureMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithJerasureMatrix) allows constructing a [Jerasure](https://github.com/tsuraan/Jerasure) compatible matrix. + +## 2021 + +* Use `GOAMD64=v4` to enable faster AVX2. +* Add progressive shard encoding. +* Wider AVX2 loops +* Limit concurrency on AVX2, since we are likely memory bound. +* Allow 0 parity shards. +* Allow disabling inversion cache. +* Faster AVX2 encoding. + +
+ See older changes + +## May 2020 + +* ARM64 optimizations, up to 2.5x faster. +* Added [WithFastOneParityMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithFastOneParityMatrix) for faster operation with 1 parity shard. +* Much better performance when using a limited number of goroutines. +* AVX512 is now using multiple cores. +* Stream processing overhaul, big speedups in most cases. +* AVX512 optimizations + +## March 6, 2019 + +The pure Go implementation is about 30% faster. Minor tweaks to assembler implementations. + +## February 8, 2019 + +AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2. +See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details. + +## December 18, 2018 + +Assembly code for ppc64le has been contributed, this boosts performance by about 10x on this platform. + +## November 18, 2017 + +Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt +to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU. + +## October 1, 2017 + +* [Cauchy Matrix](https://godoc.org/github.com/klauspost/reedsolomon#WithCauchyMatrix) is now an option. +Thanks to [templexxx](https://github.com/templexxx) for the basis of this. + +* Default maximum number of [goroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithMaxGoroutines) +has been increased for better multi-core scaling. + +* After several requests the Reconstruct and ReconstructData now slices of zero length but sufficient capacity to +be used instead of allocating new memory. + +## August 26, 2017 + +* The [`Encoder()`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) now contains an `Update` +function contributed by [chenzhongtao](https://github.com/chenzhongtao). + +* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, +which gives a huge performance boost on this platform. + +## July 20, 2017 + +`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. +This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added: + +```Go +func (e *YourEnc) ReconstructData(shards [][]byte) error { + return ReconstructData(shards) +} +``` + +You can of course also do your own implementation. +The [`StreamEncoder`](https://godoc.org/github.com/klauspost/reedsolomon#StreamEncoder) +handles this without modifying the interface. +This is a good lesson on why returning interfaces is not a good design. + +
+ +# Usage + +This section assumes you know the basics of Reed-Solomon encoding. +A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/). + +This package performs the calculation of the parity sets. The usage is therefore relatively simple. + +First of all, you need to choose your distribution of data and parity shards. +A 'good' distribution is very subjective, and will depend a lot on your usage scenario. + +To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated): +```Go + enc, err := reedsolomon.New(10, 3) +``` +This encoder will work for all parity sets with this distribution of data and parity shards. + +If you will primarily be using it with one shard size it is recommended to use +[`WithAutoGoroutines(shardSize)`](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithAutoGoroutines) +as an additional parameter. This will attempt to calculate the optimal number of goroutines to use for the best speed. +It is not required that all shards are this size. + +Then you send and receive data that is a simple slice of byte slices; `[][]byte`. +In the example above, the top slice must have a length of 13. + +```Go + data := make([][]byte, 13) +``` +You should then fill the 10 first slices with *equally sized* data, +and create parity shards that will be populated with parity data. In this case we create the data in memory, +but you could for instance also use [mmap](https://github.com/edsrzf/mmap-go) to map files. + +```Go + // Create all shards, size them at 50000 each + for i := range input { + data[i] := make([]byte, 50000) + } + + // The above allocations can also be done by the encoder: + // data := enc.(reedsolomon.Extended).AllocAligned(50000) + + // Fill some data into the data shards + for i, in := range data[:10] { + for j:= range in { + in[j] = byte((i+j)&0xff) + } + } +``` + +To populate the parity shards, you simply call `Encode()` with your data. +```Go + err = enc.Encode(data) +``` +The only cases where you should get an error is, if the data shards aren't of equal size. +The last 3 shards now contain parity data. You can verify this by calling `Verify()`: + +```Go + ok, err = enc.Verify(data) +``` + +The final (and important) part is to be able to reconstruct missing shards. +For this to work, you need to know which parts of your data is missing. +The encoder *does not know which parts are invalid*, so if data corruption is a likely scenario, +you need to implement a hash check for each shard. + +If a byte has changed in your set, and you don't know which it is, there is no way to reconstruct the data set. + +To indicate missing data, you set the shard to nil before calling `Reconstruct()`: + +```Go + // Delete two data shards + data[3] = nil + data[7] = nil + + // Reconstruct the missing shards + err := enc.Reconstruct(data) +``` +The missing data and parity shards will be recreated. If more than 3 shards are missing, the reconstruction will fail. 
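The hash check mentioned above is left entirely to the caller. As a minimal sketch (SHA-256 is an assumption here; any collision-resistant hash will do), each shard can be fingerprinted when written and zeroed out on mismatch before reconstruction:

```Go
package shardcheck

import (
	"bytes"
	"crypto/sha256"
)

// shardHashes returns one SHA-256 digest per shard; store them alongside the shards.
func shardHashes(shards [][]byte) [][sha256.Size]byte {
	sums := make([][sha256.Size]byte, len(shards))
	for i, s := range shards {
		sums[i] = sha256.Sum256(s)
	}
	return sums
}

// dropCorrupt nils out any shard whose digest no longer matches,
// which is how Reconstruct() learns which shards to rebuild.
func dropCorrupt(shards [][]byte, sums [][sha256.Size]byte) {
	for i, s := range shards {
		if got := sha256.Sum256(s); !bytes.Equal(got[:], sums[i][:]) {
			shards[i] = nil
		}
	}
}
```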
+ +If you are only interested in the data shards (for reading purposes) you can call `ReconstructData()`: + +```Go + // Delete two data shards + data[3] = nil + data[7] = nil + + // Reconstruct just the missing data shards + err := enc.ReconstructData(data) +``` + +If you don't need all data shards you can use `ReconstructSome()`: + +```Go + // Delete two data shards + data[3] = nil + data[7] = nil + + // Reconstruct just the shard 3 + err := enc.ReconstructSome(data, []bool{false, false, false, true, false, false, false, false}) +``` + +So to sum up reconstruction: +* The number of data/parity shards must match the numbers used for encoding. +* The order of shards must be the same as used when encoding. +* You may only supply data you know is valid. +* Invalid shards should be set to nil. + +For complete examples of an encoder and decoder see the +[examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples). + +# Splitting/Joining Data + +You might have a large slice of data. +To help you split this, there are some helper functions that can split and join a single byte slice. + +```Go + bigfile, _ := ioutil.Readfile("myfile.data") + + // Split the file + split, err := enc.Split(bigfile) +``` +This will split the file into the number of data shards set when creating the encoder and create empty parity shards. + +An important thing to note is that you have to *keep track of the exact input size*. +If the size of the input isn't divisible by the number of data shards, extra zeros will be inserted in the last shard. + +To join a data set, use the `Join()` function, which will join the shards and write it to the `io.Writer` you supply: +```Go + // Join a data set and write it to io.Discard. + err = enc.Join(io.Discard, data, len(bigfile)) +``` + +## Aligned Allocations + +For AMD64 aligned inputs can make a big speed difference. + +This is an example of the speed difference when inputs are unaligned/aligned: + +``` +BenchmarkEncode100x20x10000-32 7058 172648 ns/op 6950.57 MB/s +BenchmarkEncode100x20x10000-32 8406 137911 ns/op 8701.24 MB/s +``` + +This is mostly the case when dealing with odd-sized shards. + +To facilitate this the package provides an `AllocAligned(shards, each int) [][]byte`. +This will allocate a number of shards, each with the size `each`. +Each shard will then be aligned to a 64 byte boundary. + +Each encoder also has a `AllocAligned(each int) [][]byte` as an extended interface which will return the same, +but with the shard count configured in the encoder. + +It is not possible to re-aligned already allocated slices, for example when using `Split`. +When it is not possible to write to aligned shards, you should not copy to them. + +# Progressive encoding + +It is possible to encode individual shards using EncodeIdx: + +```Go + // EncodeIdx will add parity for a single data shard. + // Parity shards should start out as 0. The caller must zero them. + // Data shards must be delivered exactly once. There is no check for this. + // The parity shards will always be updated and the data shards will remain the same. + EncodeIdx(dataShard []byte, idx int, parity [][]byte) error +``` + +This allows progressively encoding the parity by sending individual data shards. +There is no requirement on shards being delivered in order, +but when sent in order it allows encoding shards one at the time, +effectively allowing the operation to be streaming. + +The result will be the same as encoding all shards at once. 
+There is a minor speed penalty using this method, so send
+shards at once if they are available.
+
+## Example
+
+```Go
+func test() {
+	// Create an encoder with 7 data and 3 parity slices.
+	enc, _ := reedsolomon.New(7, 3)
+
+	// This will be our output parity.
+	parity := make([][]byte, 3)
+	for i := range parity {
+		parity[i] = make([]byte, 10000)
+	}
+
+	for i := 0; i < 7; i++ {
+		// Send data shards one at a time.
+		_ = enc.EncodeIdx(make([]byte, 10000), i, parity)
+	}
+
+	// parity now contains the parity data, as if all data had been sent in one call.
+}
+```
+
+# Streaming/Merging
+
+It might seem like a limitation that all data should be in memory,
+but an important property is that *as long as the number of data/parity shards is the same,
+you can merge/split data sets*, and they will remain valid as a separate set.
+
+```Go
+// Split the data set of 50000 elements into two of 25000
+splitA := make([][]byte, 13)
+splitB := make([][]byte, 13)
+
+// Merge into a 100000 element set
+merged := make([][]byte, 13)
+
+for i := range data {
+	splitA[i] = data[i][:25000]
+	splitB[i] = data[i][25000:]
+
+	// Concatenate it to itself
+	merged[i] = append(make([]byte, 0, len(data[i])*2), data[i]...)
+	merged[i] = append(merged[i], data[i]...)
+}
+
+// Each part should still verify as ok.
+ok, err := enc.Verify(splitA)
+if ok && err == nil {
+	log.Println("splitA ok")
+}
+
+ok, err = enc.Verify(splitB)
+if ok && err == nil {
+	log.Println("splitB ok")
+}
+
+ok, err = enc.Verify(merged)
+if ok && err == nil {
+	log.Println("merged ok")
+}
+```
+
+This means that if you have a data set that may not fit into memory, you can split processing into smaller blocks.
+For the best throughput, don't use too small blocks.
+
+This also means that you can divide big input up into smaller blocks, and do reconstruction on parts of your data.
+This doesn't give the same flexibility as a higher number of data shards, but it will be much more performant.
+
+# Streaming API
+
+Support has been added for a streaming API, to help perform fully streaming operations,
+which enables you to do the same operations, but on streams.
+To use the stream API, use the [`NewStream`](https://godoc.org/github.com/klauspost/reedsolomon#NewStream) function
+to create the encoding/decoding interfaces.
+
+You can use [`WithConcurrentStreams`](https://godoc.org/github.com/klauspost/reedsolomon#WithConcurrentStreams)
+to create an interface that reads/writes concurrently from the streams.
+
+You can specify the size of each operation using
+[`WithStreamBlockSize`](https://godoc.org/github.com/klauspost/reedsolomon#WithStreamBlockSize).
+This will set the size of each read/write operation.
+
+Input is delivered as `[]io.Reader`, output as `[]io.Writer`, and functionality corresponds to the in-memory API.
+Each stream must supply the same amount of data, similar to how each slice must be the same size with the in-memory API.
+If an error occurs in relation to a stream,
+a [`StreamReadError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamReadError)
+or [`StreamWriteError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamWriteError)
+will help you determine which stream was the offender.
+
+There is no buffering or timeouts/retry specified. If you want to add that, you need to add it to the Reader/Writer.
+
+For complete examples of a streaming encoder and decoder see the
+[examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
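+
+If you prefer a snippet inline, here is a minimal sketch of the streaming calls, given four equally sized data
+shards. It uses `bytes.Buffer` and `bytes.Reader` purely for illustration; files or network streams work the same
+way, and error handling is trimmed.
+
+```Go
+// Sketch: stream 4 data shards and produce 2 parity shards.
+// Assumes "bytes", "io" and the reedsolomon package are imported.
+func streamSketch(shards [][]byte) (ok bool, err error) {
+	enc, err := reedsolomon.NewStream(4, 2)
+	if err != nil {
+		return false, err
+	}
+
+	// The 4 equally sized data shards, wrapped as readers.
+	readers := make([]io.Reader, 4)
+	for i := range readers {
+		readers[i] = bytes.NewReader(shards[i])
+	}
+
+	// Parity is written to two in-memory buffers.
+	var parity [2]bytes.Buffer
+	if err := enc.Encode(readers, []io.Writer{&parity[0], &parity[1]}); err != nil {
+		return false, err
+	}
+
+	// Verify needs fresh readers over all 6 shards.
+	all := make([]io.Reader, 6)
+	for i := 0; i < 4; i++ {
+		all[i] = bytes.NewReader(shards[i])
+	}
+	all[4] = bytes.NewReader(parity[0].Bytes())
+	all[5] = bytes.NewReader(parity[1].Bytes())
+	return enc.Verify(all)
+}
+```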
+
+GF16 (more than 256 shards) is not supported by the streaming interface.
+
+# Advanced Options
+
+You can modify internal options which affect how jobs are split between and processed by goroutines.
+
+To create options, use the WithXXX functions. You can supply options to `New` and `NewStream`.
+If no options are supplied, default options are used.
+
+Example of how to supply options:
+
+```Go
+enc, err := reedsolomon.New(10, 3, reedsolomon.WithMaxGoroutines(25))
+```
+
+# Leopard Compatible GF16
+
+When you encode more than 256 shards the library will switch to a [Leopard-RS](https://github.com/catid/leopard) implementation.
+
+This allows encoding up to 65536 shards (data+parity) with the following limitations, similar to leopard:
+
+* The original and recovery data must not exceed 65536 pieces.
+* Each shard size *must* be a multiple of 64 bytes.
+* Each buffer should have the same number of bytes.
+* Even the last shard must be rounded up to the block size.
+
+|                 | Regular | Leopard |
+|-----------------|---------|---------|
+| Encode          | ✓       | ✓       |
+| EncodeIdx       | ✓       | -       |
+| Verify          | ✓       | ✓       |
+| Reconstruct     | ✓       | ✓       |
+| ReconstructData | ✓       | ✓       |
+| ReconstructSome | ✓       | ✓ (+)   |
+| Update          | ✓       | -       |
+| Split           | ✓       | ✓       |
+| Join            | ✓       | ✓       |
+
+* (+) Same as calling `ReconstructData`.
+
+The Split/Join functions will help to split an input to the proper sizes.
+
+Speed can be expected to be `O(N*log(N))`, compared to the `O(N*N)` of the regular code paths.
+Reconstruction matrix calculation is more time-consuming,
+so be sure to include that as part of any benchmark you run.
+
+For now SSSE3, AVX2 and AVX512 assembly are available on AMD64 platforms.
+
+Leopard mode currently always runs as a single goroutine, since multiple
+goroutines don't provide any worthwhile speedup.
+
+## Leopard GF8
+
+It is possible to replace the default Reed-Solomon encoder with a Leopard-compatible one.
+This will typically be faster when dealing with more than 20-30 shards.
+Note that the limitations listed above also apply to this mode.
+See the table below for speed with different numbers of shards.
+
+To enable Leopard GF8 mode use `WithLeopardGF(true)`.
+
+Benchmark Encoding and Reconstructing *1KB* shards with a variable number of shards.
+All implementations use the inversion cache when available.
+Speed is total shard size for each operation. Data shard throughput is speed/2.
+AVX2 is used.
+
+| Encoder      | Shards      | Encode         | Recover All  | Recover One    |
+|--------------|-------------|----------------|--------------|----------------|
+| Cauchy       | 4+4         | 23076.83 MB/s  | 5444.02 MB/s | 10834.67 MB/s  |
+| Cauchy       | 8+8         | 15206.87 MB/s  | 4223.42 MB/s | 16181.62 MB/s  |
+| Cauchy       | 16+16       | 7427.47 MB/s   | 3305.84 MB/s | 22480.41 MB/s  |
+| Cauchy       | 32+32       | 3785.64 MB/s   | 2300.07 MB/s | 26181.31 MB/s  |
+| Cauchy       | 64+64       | 1911.93 MB/s   | 1368.51 MB/s | 27992.93 MB/s  |
+| Cauchy       | 128+128     | 963.83 MB/s    | 1327.56 MB/s | 32866.86 MB/s  |
+| Leopard GF8  | 4+4         | 17061.28 MB/s  | 3099.06 MB/s | 4096.78 MB/s   |
+| Leopard GF8  | 8+8         | 10546.67 MB/s  | 2925.92 MB/s | 3964.00 MB/s   |
+| Leopard GF8  | 16+16       | 10961.37 MB/s  | 2328.40 MB/s | 3110.22 MB/s   |
+| Leopard GF8  | 32+32       | 7111.47 MB/s   | 2374.61 MB/s | 3220.75 MB/s   |
+| Leopard GF8  | 64+64       | 7468.57 MB/s   | 2055.41 MB/s | 3061.81 MB/s   |
+| Leopard GF8  | 128+128     | 5479.99 MB/s   | 1953.21 MB/s | 2815.15 MB/s   |
+| Leopard GF16 | 256+256     | 6158.66 MB/s   | 454.14 MB/s  | 506.70 MB/s    |
+| Leopard GF16 | 512+512     | 4418.58 MB/s   | 685.75 MB/s  | 801.63 MB/s    |
+| Leopard GF16 | 1024+1024   | 4778.05 MB/s   | 814.51 MB/s  | 1080.19 MB/s   |
+| Leopard GF16 | 2048+2048   | 3417.05 MB/s   | 911.64 MB/s  | 1179.48 MB/s   |
+| Leopard GF16 | 4096+4096   | 3209.41 MB/s   | 729.13 MB/s  | 1135.06 MB/s   |
+| Leopard GF16 | 8192+8192   | 2034.11 MB/s   | 604.52 MB/s  | 842.13 MB/s    |
+| Leopard GF16 | 16384+16384 | 1525.88 MB/s   | 486.74 MB/s  | 750.01 MB/s    |
+| Leopard GF16 | 32768+32768 | 1138.67 MB/s   | 482.81 MB/s  | 712.73 MB/s    |
+
+"Traditional" encoding is faster until somewhere between 16 and 32 shards.
+Leopard provides fast encoding in all cases, but shows a significant overhead for reconstruction.
+
+Calculating the reconstruction matrix takes a significant amount of computation.
+With bigger shards that cost is relatively smaller; arguably, fewer shards typically also means bigger shards.
+Due to the high shard count, caching reconstruction matrices generally isn't feasible for Leopard.
+
+# Performance
+
+Performance depends mainly on the number of parity shards.
+In rough terms, doubling the number of parity shards will double the encoding time.
+
+Here are the throughput numbers with some different selections of data and parity shards.
+For reference, each shard is 1MB of random data, and 16 CPU cores are used for encoding.
+
+| Data | Parity | Go MB/s | SSSE3 MB/s | AVX2 MB/s |
+|------|--------|---------|------------|-----------|
+| 5    | 2      | 20,772  | 66,355     | 108,755   |
+| 8    | 8      | 6,815   | 38,338     | 70,516    |
+| 10   | 4      | 9,245   | 48,237     | 93,875    |
+| 50   | 20     | 2,063   | 12,130     | 22,828    |
+
+The throughput numbers here are the size of the encoded data and parity shards.
+
+If `runtime.GOMAXPROCS()` is set to a value higher than 1,
+the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.
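+
+If most of your shards will have one known size, here is a small sketch of tuning for it with the
+`WithAutoGoroutines` option mentioned in the Usage section (the shard size and layout below are just example values):
+
+```Go
+// Let the encoder attempt to pick the goroutine count that is fastest
+// for ~1 MB shards. WithMaxGoroutines (shown above) sets a hard cap instead.
+enc, err := reedsolomon.New(10, 4, reedsolomon.WithAutoGoroutines(1<<20))
+```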
+
+Benchmarking `Reconstruct()` followed by a `Verify()` (=`all`) versus just calling `ReconstructData()` (=`data`) gives the following result:
+```
+benchmark                            all MB/s     data MB/s    speedup
+BenchmarkReconstruct10x2x10000-8     2011.67      10530.10     5.23x
+BenchmarkReconstruct50x5x50000-8     4585.41      14301.60     3.12x
+BenchmarkReconstruct10x2x1M-8        8081.15      28216.41     3.49x
+BenchmarkReconstruct5x2x1M-8         5780.07      28015.37     4.85x
+BenchmarkReconstruct10x4x1M-8        4352.56      14367.61     3.30x
+BenchmarkReconstruct50x20x1M-8       1364.35      4189.79      3.07x
+BenchmarkReconstruct10x4x16M-8       1484.35      5779.53      3.89x
+```
+
+The package will use [GFNI](https://en.wikipedia.org/wiki/AVX-512#GFNI) instructions combined with AVX512 when these are available.
+This further improves speed by up to 3x over the AVX2 code paths.
+
+## ARM64 NEON
+
+By exploiting NEON instructions the performance for ARM has been accelerated.
+Below are the performance numbers for a single core on an EC2 m6g.16xlarge (Graviton2) instance (Amazon Linux 2):
+
+```
+BenchmarkGalois128K-64        119562     10028 ns/op    13070.78 MB/s
+BenchmarkGalois1M-64           14380     83424 ns/op    12569.22 MB/s
+BenchmarkGaloisXor128K-64      96508     12432 ns/op    10543.29 MB/s
+BenchmarkGaloisXor1M-64        10000    100322 ns/op    10452.13 MB/s
+```
+
+# Performance on ppc64le
+
+The performance for ppc64le has been accelerated.
+This gives roughly a 10x performance improvement on this architecture, as can be seen below:
+
+```
+benchmark                      old MB/s     new MB/s     speedup
+BenchmarkGalois128K-160        948.87       8878.85      9.36x
+BenchmarkGalois1M-160          968.85       9041.92      9.33x
+BenchmarkGaloisXor128K-160     862.02       7905.00      9.17x
+BenchmarkGaloisXor1M-160       784.60       6296.65      8.03x
+```
+
+# Legal
+
+> None of the section below is legal advice. Seek your own legal counsel.
+> As stated by the [LICENSE](LICENSE) the authors will not be held liable for any use of this library.
+> Users are encouraged to independently verify they comply with all legal requirements.
+
+As can be seen in [recent news](https://www.datanami.com/2023/10/16/cloudera-hit-with-240-million-judgement-over-erasure-coding/)
+there have been lawsuits related to possible patents on aspects of erasure coding functionality.
+
+As a possible mitigation it is possible to use the tag `nopshufb` when compiling any code which includes this package.
+This will remove all inclusion and use of `PSHUFB` and equivalent instructions on other platforms.
+
+This is done by adding `-tags=nopshufb` to `go build` and similar commands that produce binary output.
+
+The removed code may not be infringing, and even after `-tags=nopshufb` there may still be infringing code left.
+
+# Links
+* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
+* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible Java library by Backblaze.
+* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
+* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
+* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
+* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
+* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
+* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations. +* [Leopard-RS](https://github.com/catid/leopard) C library used as basis for GF16 implementation. + +# License + +This code, as the original [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go new file mode 100644 index 000000000..697f9ca67 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois.go @@ -0,0 +1,979 @@ +/** + * 8-bit Galois Field + * Copyright 2015, Klaus Post + * Copyright 2015, Backblaze, Inc. All rights reserved. + */ + +package reedsolomon + +import ( + "encoding/binary" +) + +const ( + // The number of elements in the field. + fieldSize = 256 + + // The polynomial used to generate the logarithm table. + // + // There are a number of polynomials that work to generate + // a Galois field of 256 elements. The choice is arbitrary, + // and we just use the first one. + // + // The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105, + //* 113, 135, 141, 169, 195, 207, 231, and 245. + generatingPolynomial = 29 +) + +var logTable = [fieldSize]byte{ + 0, 0, 1, 25, 2, 50, 26, 198, + 3, 223, 51, 238, 27, 104, 199, 75, + 4, 100, 224, 14, 52, 141, 239, 129, + 28, 193, 105, 248, 200, 8, 76, 113, + 5, 138, 101, 47, 225, 36, 15, 33, + 53, 147, 142, 218, 240, 18, 130, 69, + 29, 181, 194, 125, 106, 39, 249, 185, + 201, 154, 9, 120, 77, 228, 114, 166, + 6, 191, 139, 98, 102, 221, 48, 253, + 226, 152, 37, 179, 16, 145, 34, 136, + 54, 208, 148, 206, 143, 150, 219, 189, + 241, 210, 19, 92, 131, 56, 70, 64, + 30, 66, 182, 163, 195, 72, 126, 110, + 107, 58, 40, 84, 250, 133, 186, 61, + 202, 94, 155, 159, 10, 21, 121, 43, + 78, 212, 229, 172, 115, 243, 167, 87, + 7, 112, 192, 247, 140, 128, 99, 13, + 103, 74, 222, 237, 49, 197, 254, 24, + 227, 165, 153, 119, 38, 184, 180, 124, + 17, 68, 146, 217, 35, 32, 137, 46, + 55, 63, 209, 91, 149, 188, 207, 205, + 144, 135, 151, 178, 220, 252, 190, 97, + 242, 86, 211, 171, 20, 42, 93, 158, + 132, 60, 57, 83, 71, 109, 65, 162, + 31, 45, 67, 216, 183, 123, 164, 118, + 196, 23, 73, 236, 127, 12, 111, 246, + 108, 161, 59, 82, 41, 157, 85, 170, + 251, 96, 134, 177, 187, 204, 62, 90, + 203, 89, 95, 176, 156, 169, 160, 81, + 11, 245, 22, 235, 122, 117, 44, 215, + 79, 174, 213, 233, 230, 231, 173, 232, + 116, 214, 244, 234, 168, 80, 88, 175, +} + +/** + * Inverse of the logarithm table. Maps integer logarithms + * to members of the field. Entry 255 is the same as entry 0 sue to mod 255. + * + * This table was generated by `go run gentables.go` + * Table has been truncated to 256 bytes, since no lookups are bigger. 
+ */ +var expTable = [256]byte{0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 0x8f, 0x3, 0x6, 0xc, 0x18, 0x30, 0x60, 0xc0, 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 0x46, 0x8c, 0x5, 0xa, 0x14, 0x28, 0x50, 0xa0, 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 0x65, 0xca, 0x89, 0xf, 0x1e, 0x3c, 0x78, 0xf0, 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 0xd, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x7, 0xe, 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x9, 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0xb, 0x16, 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x1} + +func galAdd(a, b byte) byte { + return a ^ b +} + +// Table from https://github.com/templexxx/reedsolomon +var invTable = [256]byte{0x0, 0x1, 0x8e, 0xf4, 0x47, 0xa7, 0x7a, 0xba, 0xad, 0x9d, 0xdd, 0x98, 0x3d, 0xaa, 0x5d, 0x96, 0xd8, 0x72, 0xc0, 0x58, 0xe0, 0x3e, 0x4c, 0x66, 0x90, 0xde, 0x55, 0x80, 0xa0, 0x83, 0x4b, 0x2a, 0x6c, 0xed, 0x39, 0x51, 0x60, 0x56, 0x2c, 0x8a, 0x70, 0xd0, 0x1f, 0x4a, 0x26, 0x8b, 0x33, 0x6e, 0x48, 0x89, 0x6f, 0x2e, 0xa4, 0xc3, 0x40, 0x5e, 0x50, 0x22, 0xcf, 0xa9, 0xab, 0xc, 0x15, 0xe1, 0x36, 0x5f, 0xf8, 0xd5, 0x92, 0x4e, 0xa6, 0x4, 0x30, 0x88, 0x2b, 0x1e, 0x16, 0x67, 0x45, 0x93, 0x38, 0x23, 0x68, 0x8c, 0x81, 0x1a, 0x25, 0x61, 0x13, 0xc1, 0xcb, 0x63, 0x97, 0xe, 0x37, 0x41, 0x24, 0x57, 0xca, 0x5b, 0xb9, 0xc4, 0x17, 0x4d, 0x52, 0x8d, 0xef, 0xb3, 0x20, 0xec, 0x2f, 0x32, 0x28, 0xd1, 0x11, 0xd9, 0xe9, 0xfb, 0xda, 0x79, 0xdb, 0x77, 0x6, 0xbb, 0x84, 0xcd, 0xfe, 0xfc, 0x1b, 0x54, 0xa1, 0x1d, 0x7c, 0xcc, 0xe4, 0xb0, 0x49, 0x31, 0x27, 0x2d, 0x53, 0x69, 0x2, 0xf5, 0x18, 0xdf, 0x44, 0x4f, 0x9b, 0xbc, 0xf, 0x5c, 0xb, 0xdc, 0xbd, 0x94, 0xac, 0x9, 0xc7, 0xa2, 0x1c, 0x82, 0x9f, 0xc6, 0x34, 0xc2, 0x46, 0x5, 0xce, 0x3b, 0xd, 0x3c, 0x9c, 0x8, 0xbe, 0xb7, 0x87, 0xe5, 0xee, 0x6b, 0xeb, 0xf2, 0xbf, 0xaf, 0xc5, 0x64, 0x7, 0x7b, 0x95, 0x9a, 0xae, 0xb6, 0x12, 0x59, 0xa5, 0x35, 0x65, 0xb8, 0xa3, 0x9e, 0xd2, 0xf7, 0x62, 0x5a, 0x85, 0x7d, 0xa8, 0x3a, 0x29, 0x71, 0xc8, 0xf6, 0xf9, 0x43, 0xd7, 0xd6, 0x10, 0x73, 0x76, 0x78, 0x99, 0xa, 0x19, 0x91, 0x14, 0x3f, 0xe6, 0xf0, 0x86, 0xb1, 0xe2, 0xf1, 0xfa, 0x74, 0xf3, 0xb4, 0x6d, 0x21, 0xb2, 0x6a, 0xe3, 0xe7, 0xb5, 0xea, 0x3, 0x8f, 0xd3, 0xc9, 0x42, 0xd4, 0xe8, 0x75, 0x7f, 0xff, 0x7e, 0xfd} + +var mulTable = [256][256]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}, + {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 0x1d, 0x1f, 0x19, 0x1b, 0x15, 0x17, 0x11, 0x13, 0xd, 0xf, 0x9, 0xb, 0x5, 0x7, 0x1, 0x3, 0x3d, 0x3f, 0x39, 0x3b, 0x35, 0x37, 0x31, 0x33, 0x2d, 0x2f, 0x29, 0x2b, 0x25, 0x27, 0x21, 0x23, 0x5d, 0x5f, 0x59, 0x5b, 0x55, 0x57, 0x51, 0x53, 0x4d, 0x4f, 0x49, 0x4b, 0x45, 0x47, 0x41, 0x43, 0x7d, 0x7f, 
0x79, 0x7b, 0x75, 0x77, 0x71, 0x73, 0x6d, 0x6f, 0x69, 0x6b, 0x65, 0x67, 0x61, 0x63, 0x9d, 0x9f, 0x99, 0x9b, 0x95, 0x97, 0x91, 0x93, 0x8d, 0x8f, 0x89, 0x8b, 0x85, 0x87, 0x81, 0x83, 0xbd, 0xbf, 0xb9, 0xbb, 0xb5, 0xb7, 0xb1, 0xb3, 0xad, 0xaf, 0xa9, 0xab, 0xa5, 0xa7, 0xa1, 0xa3, 0xdd, 0xdf, 0xd9, 0xdb, 0xd5, 0xd7, 0xd1, 0xd3, 0xcd, 0xcf, 0xc9, 0xcb, 0xc5, 0xc7, 0xc1, 0xc3, 0xfd, 0xff, 0xf9, 0xfb, 0xf5, 0xf7, 0xf1, 0xf3, 0xed, 0xef, 0xe9, 0xeb, 0xe5, 0xe7, 0xe1, 0xe3}, + {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, 0x9d, 0x9e, 0x9b, 0x98, 0x91, 0x92, 0x97, 0x94, 0x85, 0x86, 0x83, 0x80, 0x89, 0x8a, 0x8f, 0x8c, 0xad, 0xae, 0xab, 0xa8, 0xa1, 0xa2, 0xa7, 0xa4, 0xb5, 0xb6, 0xb3, 0xb0, 0xb9, 0xba, 0xbf, 0xbc, 0xfd, 0xfe, 0xfb, 0xf8, 0xf1, 0xf2, 0xf7, 0xf4, 0xe5, 0xe6, 0xe3, 0xe0, 0xe9, 0xea, 0xef, 0xec, 0xcd, 0xce, 0xcb, 0xc8, 0xc1, 0xc2, 0xc7, 0xc4, 0xd5, 0xd6, 0xd3, 0xd0, 0xd9, 0xda, 0xdf, 0xdc, 0x5d, 0x5e, 0x5b, 0x58, 0x51, 0x52, 0x57, 0x54, 0x45, 0x46, 0x43, 0x40, 0x49, 0x4a, 0x4f, 0x4c, 0x6d, 0x6e, 0x6b, 0x68, 0x61, 0x62, 0x67, 0x64, 0x75, 0x76, 0x73, 0x70, 0x79, 0x7a, 0x7f, 0x7c, 0x3d, 0x3e, 0x3b, 0x38, 0x31, 0x32, 0x37, 0x34, 0x25, 0x26, 0x23, 0x20, 0x29, 0x2a, 0x2f, 0x2c, 0xd, 0xe, 0xb, 0x8, 0x1, 0x2, 0x7, 0x4, 0x15, 0x16, 0x13, 0x10, 0x19, 0x1a, 0x1f, 0x1c}, + {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c, 0x40, 0x44, 0x48, 0x4c, 0x50, 0x54, 0x58, 0x5c, 0x60, 0x64, 0x68, 0x6c, 0x70, 0x74, 0x78, 0x7c, 0x80, 0x84, 0x88, 0x8c, 0x90, 0x94, 0x98, 0x9c, 0xa0, 0xa4, 0xa8, 0xac, 0xb0, 0xb4, 0xb8, 0xbc, 0xc0, 0xc4, 0xc8, 0xcc, 0xd0, 0xd4, 0xd8, 0xdc, 0xe0, 0xe4, 0xe8, 0xec, 0xf0, 0xf4, 0xf8, 0xfc, 0x1d, 0x19, 0x15, 0x11, 0xd, 0x9, 0x5, 0x1, 0x3d, 0x39, 0x35, 0x31, 0x2d, 0x29, 0x25, 0x21, 0x5d, 0x59, 0x55, 0x51, 0x4d, 0x49, 0x45, 0x41, 0x7d, 0x79, 0x75, 0x71, 0x6d, 0x69, 0x65, 0x61, 0x9d, 0x99, 0x95, 0x91, 0x8d, 0x89, 0x85, 0x81, 0xbd, 0xb9, 0xb5, 0xb1, 0xad, 0xa9, 0xa5, 0xa1, 0xdd, 0xd9, 0xd5, 0xd1, 0xcd, 0xc9, 0xc5, 0xc1, 0xfd, 0xf9, 0xf5, 0xf1, 0xed, 0xe9, 0xe5, 0xe1, 0x3a, 0x3e, 0x32, 0x36, 0x2a, 0x2e, 0x22, 0x26, 0x1a, 0x1e, 0x12, 0x16, 0xa, 0xe, 0x2, 0x6, 0x7a, 0x7e, 0x72, 0x76, 0x6a, 0x6e, 0x62, 0x66, 0x5a, 0x5e, 0x52, 0x56, 0x4a, 0x4e, 0x42, 0x46, 0xba, 0xbe, 0xb2, 0xb6, 0xaa, 0xae, 0xa2, 0xa6, 0x9a, 0x9e, 0x92, 0x96, 0x8a, 0x8e, 0x82, 0x86, 0xfa, 0xfe, 0xf2, 0xf6, 0xea, 0xee, 0xe2, 0xe6, 0xda, 0xde, 0xd2, 0xd6, 0xca, 0xce, 0xc2, 0xc6, 0x27, 0x23, 0x2f, 0x2b, 0x37, 0x33, 0x3f, 0x3b, 0x7, 0x3, 0xf, 0xb, 0x17, 0x13, 0x1f, 0x1b, 0x67, 0x63, 0x6f, 0x6b, 0x77, 0x73, 0x7f, 0x7b, 0x47, 0x43, 0x4f, 0x4b, 0x57, 0x53, 0x5f, 0x5b, 0xa7, 0xa3, 0xaf, 0xab, 0xb7, 0xb3, 0xbf, 0xbb, 0x87, 0x83, 0x8f, 0x8b, 0x97, 0x93, 0x9f, 0x9b, 0xe7, 0xe3, 0xef, 0xeb, 0xf7, 0xf3, 0xff, 0xfb, 0xc7, 0xc3, 0xcf, 0xcb, 0xd7, 0xd3, 0xdf, 0xdb}, + {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 
0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33, 0x50, 0x55, 0x5a, 0x5f, 0x44, 0x41, 0x4e, 0x4b, 0x78, 0x7d, 0x72, 0x77, 0x6c, 0x69, 0x66, 0x63, 0xa0, 0xa5, 0xaa, 0xaf, 0xb4, 0xb1, 0xbe, 0xbb, 0x88, 0x8d, 0x82, 0x87, 0x9c, 0x99, 0x96, 0x93, 0xf0, 0xf5, 0xfa, 0xff, 0xe4, 0xe1, 0xee, 0xeb, 0xd8, 0xdd, 0xd2, 0xd7, 0xcc, 0xc9, 0xc6, 0xc3, 0x5d, 0x58, 0x57, 0x52, 0x49, 0x4c, 0x43, 0x46, 0x75, 0x70, 0x7f, 0x7a, 0x61, 0x64, 0x6b, 0x6e, 0xd, 0x8, 0x7, 0x2, 0x19, 0x1c, 0x13, 0x16, 0x25, 0x20, 0x2f, 0x2a, 0x31, 0x34, 0x3b, 0x3e, 0xfd, 0xf8, 0xf7, 0xf2, 0xe9, 0xec, 0xe3, 0xe6, 0xd5, 0xd0, 0xdf, 0xda, 0xc1, 0xc4, 0xcb, 0xce, 0xad, 0xa8, 0xa7, 0xa2, 0xb9, 0xbc, 0xb3, 0xb6, 0x85, 0x80, 0x8f, 0x8a, 0x91, 0x94, 0x9b, 0x9e, 0xba, 0xbf, 0xb0, 0xb5, 0xae, 0xab, 0xa4, 0xa1, 0x92, 0x97, 0x98, 0x9d, 0x86, 0x83, 0x8c, 0x89, 0xea, 0xef, 0xe0, 0xe5, 0xfe, 0xfb, 0xf4, 0xf1, 0xc2, 0xc7, 0xc8, 0xcd, 0xd6, 0xd3, 0xdc, 0xd9, 0x1a, 0x1f, 0x10, 0x15, 0xe, 0xb, 0x4, 0x1, 0x32, 0x37, 0x38, 0x3d, 0x26, 0x23, 0x2c, 0x29, 0x4a, 0x4f, 0x40, 0x45, 0x5e, 0x5b, 0x54, 0x51, 0x62, 0x67, 0x68, 0x6d, 0x76, 0x73, 0x7c, 0x79, 0xe7, 0xe2, 0xed, 0xe8, 0xf3, 0xf6, 0xf9, 0xfc, 0xcf, 0xca, 0xc5, 0xc0, 0xdb, 0xde, 0xd1, 0xd4, 0xb7, 0xb2, 0xbd, 0xb8, 0xa3, 0xa6, 0xa9, 0xac, 0x9f, 0x9a, 0x95, 0x90, 0x8b, 0x8e, 0x81, 0x84, 0x47, 0x42, 0x4d, 0x48, 0x53, 0x56, 0x59, 0x5c, 0x6f, 0x6a, 0x65, 0x60, 0x7b, 0x7e, 0x71, 0x74, 0x17, 0x12, 0x1d, 0x18, 0x3, 0x6, 0x9, 0xc, 0x3f, 0x3a, 0x35, 0x30, 0x2b, 0x2e, 0x21, 0x24}, + {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22, 0x60, 0x66, 0x6c, 0x6a, 0x78, 0x7e, 0x74, 0x72, 0x50, 0x56, 0x5c, 0x5a, 0x48, 0x4e, 0x44, 0x42, 0xc0, 0xc6, 0xcc, 0xca, 0xd8, 0xde, 0xd4, 0xd2, 0xf0, 0xf6, 0xfc, 0xfa, 0xe8, 0xee, 0xe4, 0xe2, 0xa0, 0xa6, 0xac, 0xaa, 0xb8, 0xbe, 0xb4, 0xb2, 0x90, 0x96, 0x9c, 0x9a, 0x88, 0x8e, 0x84, 0x82, 0x9d, 0x9b, 0x91, 0x97, 0x85, 0x83, 0x89, 0x8f, 0xad, 0xab, 0xa1, 0xa7, 0xb5, 0xb3, 0xb9, 0xbf, 0xfd, 0xfb, 0xf1, 0xf7, 0xe5, 0xe3, 0xe9, 0xef, 0xcd, 0xcb, 0xc1, 0xc7, 0xd5, 0xd3, 0xd9, 0xdf, 0x5d, 0x5b, 0x51, 0x57, 0x45, 0x43, 0x49, 0x4f, 0x6d, 0x6b, 0x61, 0x67, 0x75, 0x73, 0x79, 0x7f, 0x3d, 0x3b, 0x31, 0x37, 0x25, 0x23, 0x29, 0x2f, 0xd, 0xb, 0x1, 0x7, 0x15, 0x13, 0x19, 0x1f, 0x27, 0x21, 0x2b, 0x2d, 0x3f, 0x39, 0x33, 0x35, 0x17, 0x11, 0x1b, 0x1d, 0xf, 0x9, 0x3, 0x5, 0x47, 0x41, 0x4b, 0x4d, 0x5f, 0x59, 0x53, 0x55, 0x77, 0x71, 0x7b, 0x7d, 0x6f, 0x69, 0x63, 0x65, 0xe7, 0xe1, 0xeb, 0xed, 0xff, 0xf9, 0xf3, 0xf5, 0xd7, 0xd1, 0xdb, 0xdd, 0xcf, 0xc9, 0xc3, 0xc5, 0x87, 0x81, 0x8b, 0x8d, 0x9f, 0x99, 0x93, 0x95, 0xb7, 0xb1, 0xbb, 0xbd, 0xaf, 0xa9, 0xa3, 0xa5, 0xba, 0xbc, 0xb6, 0xb0, 0xa2, 0xa4, 0xae, 0xa8, 0x8a, 0x8c, 0x86, 0x80, 0x92, 0x94, 0x9e, 0x98, 0xda, 0xdc, 0xd6, 0xd0, 0xc2, 0xc4, 0xce, 0xc8, 0xea, 0xec, 0xe6, 0xe0, 0xf2, 0xf4, 0xfe, 0xf8, 0x7a, 0x7c, 0x76, 0x70, 0x62, 0x64, 0x6e, 0x68, 0x4a, 0x4c, 0x46, 0x40, 0x52, 0x54, 0x5e, 0x58, 0x1a, 0x1c, 0x16, 0x10, 0x2, 0x4, 0xe, 0x8, 0x2a, 0x2c, 0x26, 0x20, 0x32, 0x34, 0x3e, 0x38}, + {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d, 0x70, 0x77, 0x7e, 0x79, 0x6c, 0x6b, 0x62, 0x65, 0x48, 0x4f, 0x46, 0x41, 0x54, 0x53, 0x5a, 0x5d, 0xe0, 0xe7, 0xee, 0xe9, 0xfc, 0xfb, 0xf2, 0xf5, 0xd8, 0xdf, 0xd6, 0xd1, 0xc4, 0xc3, 0xca, 0xcd, 0x90, 0x97, 0x9e, 0x99, 0x8c, 0x8b, 0x82, 0x85, 0xa8, 0xaf, 0xa6, 0xa1, 0xb4, 0xb3, 0xba, 0xbd, 0xdd, 0xda, 0xd3, 0xd4, 0xc1, 0xc6, 0xcf, 0xc8, 0xe5, 0xe2, 0xeb, 0xec, 0xf9, 0xfe, 0xf7, 0xf0, 0xad, 0xaa, 0xa3, 0xa4, 0xb1, 0xb6, 0xbf, 0xb8, 0x95, 0x92, 
0x9b, 0x9c, 0x89, 0x8e, 0x87, 0x80, 0x3d, 0x3a, 0x33, 0x34, 0x21, 0x26, 0x2f, 0x28, 0x5, 0x2, 0xb, 0xc, 0x19, 0x1e, 0x17, 0x10, 0x4d, 0x4a, 0x43, 0x44, 0x51, 0x56, 0x5f, 0x58, 0x75, 0x72, 0x7b, 0x7c, 0x69, 0x6e, 0x67, 0x60, 0xa7, 0xa0, 0xa9, 0xae, 0xbb, 0xbc, 0xb5, 0xb2, 0x9f, 0x98, 0x91, 0x96, 0x83, 0x84, 0x8d, 0x8a, 0xd7, 0xd0, 0xd9, 0xde, 0xcb, 0xcc, 0xc5, 0xc2, 0xef, 0xe8, 0xe1, 0xe6, 0xf3, 0xf4, 0xfd, 0xfa, 0x47, 0x40, 0x49, 0x4e, 0x5b, 0x5c, 0x55, 0x52, 0x7f, 0x78, 0x71, 0x76, 0x63, 0x64, 0x6d, 0x6a, 0x37, 0x30, 0x39, 0x3e, 0x2b, 0x2c, 0x25, 0x22, 0xf, 0x8, 0x1, 0x6, 0x13, 0x14, 0x1d, 0x1a, 0x7a, 0x7d, 0x74, 0x73, 0x66, 0x61, 0x68, 0x6f, 0x42, 0x45, 0x4c, 0x4b, 0x5e, 0x59, 0x50, 0x57, 0xa, 0xd, 0x4, 0x3, 0x16, 0x11, 0x18, 0x1f, 0x32, 0x35, 0x3c, 0x3b, 0x2e, 0x29, 0x20, 0x27, 0x9a, 0x9d, 0x94, 0x93, 0x86, 0x81, 0x88, 0x8f, 0xa2, 0xa5, 0xac, 0xab, 0xbe, 0xb9, 0xb0, 0xb7, 0xea, 0xed, 0xe4, 0xe3, 0xf6, 0xf1, 0xf8, 0xff, 0xd2, 0xd5, 0xdc, 0xdb, 0xce, 0xc9, 0xc0, 0xc7}, + {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, 0x90, 0x98, 0xa0, 0xa8, 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, 0x1d, 0x15, 0xd, 0x5, 0x3d, 0x35, 0x2d, 0x25, 0x5d, 0x55, 0x4d, 0x45, 0x7d, 0x75, 0x6d, 0x65, 0x9d, 0x95, 0x8d, 0x85, 0xbd, 0xb5, 0xad, 0xa5, 0xdd, 0xd5, 0xcd, 0xc5, 0xfd, 0xf5, 0xed, 0xe5, 0x3a, 0x32, 0x2a, 0x22, 0x1a, 0x12, 0xa, 0x2, 0x7a, 0x72, 0x6a, 0x62, 0x5a, 0x52, 0x4a, 0x42, 0xba, 0xb2, 0xaa, 0xa2, 0x9a, 0x92, 0x8a, 0x82, 0xfa, 0xf2, 0xea, 0xe2, 0xda, 0xd2, 0xca, 0xc2, 0x27, 0x2f, 0x37, 0x3f, 0x7, 0xf, 0x17, 0x1f, 0x67, 0x6f, 0x77, 0x7f, 0x47, 0x4f, 0x57, 0x5f, 0xa7, 0xaf, 0xb7, 0xbf, 0x87, 0x8f, 0x97, 0x9f, 0xe7, 0xef, 0xf7, 0xff, 0xc7, 0xcf, 0xd7, 0xdf, 0x74, 0x7c, 0x64, 0x6c, 0x54, 0x5c, 0x44, 0x4c, 0x34, 0x3c, 0x24, 0x2c, 0x14, 0x1c, 0x4, 0xc, 0xf4, 0xfc, 0xe4, 0xec, 0xd4, 0xdc, 0xc4, 0xcc, 0xb4, 0xbc, 0xa4, 0xac, 0x94, 0x9c, 0x84, 0x8c, 0x69, 0x61, 0x79, 0x71, 0x49, 0x41, 0x59, 0x51, 0x29, 0x21, 0x39, 0x31, 0x9, 0x1, 0x19, 0x11, 0xe9, 0xe1, 0xf9, 0xf1, 0xc9, 0xc1, 0xd9, 0xd1, 0xa9, 0xa1, 0xb9, 0xb1, 0x89, 0x81, 0x99, 0x91, 0x4e, 0x46, 0x5e, 0x56, 0x6e, 0x66, 0x7e, 0x76, 0xe, 0x6, 0x1e, 0x16, 0x2e, 0x26, 0x3e, 0x36, 0xce, 0xc6, 0xde, 0xd6, 0xee, 0xe6, 0xfe, 0xf6, 0x8e, 0x86, 0x9e, 0x96, 0xae, 0xa6, 0xbe, 0xb6, 0x53, 0x5b, 0x43, 0x4b, 0x73, 0x7b, 0x63, 0x6b, 0x13, 0x1b, 0x3, 0xb, 0x33, 0x3b, 0x23, 0x2b, 0xd3, 0xdb, 0xc3, 0xcb, 0xf3, 0xfb, 0xe3, 0xeb, 0x93, 0x9b, 0x83, 0x8b, 0xb3, 0xbb, 0xa3, 0xab}, + {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x3d, 0x34, 0x2f, 0x26, 0x19, 0x10, 0xb, 0x2, 0x75, 0x7c, 0x67, 0x6e, 0x51, 0x58, 0x43, 0x4a, 0xad, 0xa4, 0xbf, 0xb6, 0x89, 0x80, 0x9b, 0x92, 0xe5, 0xec, 0xf7, 0xfe, 0xc1, 0xc8, 0xd3, 0xda, 0x7a, 0x73, 0x68, 0x61, 0x5e, 0x57, 0x4c, 0x45, 0x32, 0x3b, 0x20, 0x29, 0x16, 0x1f, 0x4, 0xd, 0xea, 0xe3, 0xf8, 0xf1, 0xce, 0xc7, 0xdc, 0xd5, 0xa2, 0xab, 0xb0, 0xb9, 0x86, 0x8f, 0x94, 0x9d, 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0xf, 0x6, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 0xf4, 0xfd, 0xe6, 0xef, 0xd0, 0xd9, 0xc2, 0xcb, 0xbc, 0xb5, 0xae, 0xa7, 0x98, 0x91, 0x8a, 0x83, 0x64, 0x6d, 0x76, 0x7f, 0x40, 0x49, 0x52, 0x5b, 0x2c, 0x25, 0x3e, 0x37, 0x8, 0x1, 0x1a, 0x13, 0xc9, 0xc0, 0xdb, 0xd2, 0xed, 0xe4, 0xff, 0xf6, 0x81, 0x88, 0x93, 0x9a, 0xa5, 0xac, 0xb7, 
0xbe, 0x59, 0x50, 0x4b, 0x42, 0x7d, 0x74, 0x6f, 0x66, 0x11, 0x18, 0x3, 0xa, 0x35, 0x3c, 0x27, 0x2e, 0x8e, 0x87, 0x9c, 0x95, 0xaa, 0xa3, 0xb8, 0xb1, 0xc6, 0xcf, 0xd4, 0xdd, 0xe2, 0xeb, 0xf0, 0xf9, 0x1e, 0x17, 0xc, 0x5, 0x3a, 0x33, 0x28, 0x21, 0x56, 0x5f, 0x44, 0x4d, 0x72, 0x7b, 0x60, 0x69, 0xb3, 0xba, 0xa1, 0xa8, 0x97, 0x9e, 0x85, 0x8c, 0xfb, 0xf2, 0xe9, 0xe0, 0xdf, 0xd6, 0xcd, 0xc4, 0x23, 0x2a, 0x31, 0x38, 0x7, 0xe, 0x15, 0x1c, 0x6b, 0x62, 0x79, 0x70, 0x4f, 0x46, 0x5d, 0x54}, + {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66, 0xa0, 0xaa, 0xb4, 0xbe, 0x88, 0x82, 0x9c, 0x96, 0xf0, 0xfa, 0xe4, 0xee, 0xd8, 0xd2, 0xcc, 0xc6, 0x5d, 0x57, 0x49, 0x43, 0x75, 0x7f, 0x61, 0x6b, 0xd, 0x7, 0x19, 0x13, 0x25, 0x2f, 0x31, 0x3b, 0xfd, 0xf7, 0xe9, 0xe3, 0xd5, 0xdf, 0xc1, 0xcb, 0xad, 0xa7, 0xb9, 0xb3, 0x85, 0x8f, 0x91, 0x9b, 0xba, 0xb0, 0xae, 0xa4, 0x92, 0x98, 0x86, 0x8c, 0xea, 0xe0, 0xfe, 0xf4, 0xc2, 0xc8, 0xd6, 0xdc, 0x1a, 0x10, 0xe, 0x4, 0x32, 0x38, 0x26, 0x2c, 0x4a, 0x40, 0x5e, 0x54, 0x62, 0x68, 0x76, 0x7c, 0xe7, 0xed, 0xf3, 0xf9, 0xcf, 0xc5, 0xdb, 0xd1, 0xb7, 0xbd, 0xa3, 0xa9, 0x9f, 0x95, 0x8b, 0x81, 0x47, 0x4d, 0x53, 0x59, 0x6f, 0x65, 0x7b, 0x71, 0x17, 0x1d, 0x3, 0x9, 0x3f, 0x35, 0x2b, 0x21, 0x69, 0x63, 0x7d, 0x77, 0x41, 0x4b, 0x55, 0x5f, 0x39, 0x33, 0x2d, 0x27, 0x11, 0x1b, 0x5, 0xf, 0xc9, 0xc3, 0xdd, 0xd7, 0xe1, 0xeb, 0xf5, 0xff, 0x99, 0x93, 0x8d, 0x87, 0xb1, 0xbb, 0xa5, 0xaf, 0x34, 0x3e, 0x20, 0x2a, 0x1c, 0x16, 0x8, 0x2, 0x64, 0x6e, 0x70, 0x7a, 0x4c, 0x46, 0x58, 0x52, 0x94, 0x9e, 0x80, 0x8a, 0xbc, 0xb6, 0xa8, 0xa2, 0xc4, 0xce, 0xd0, 0xda, 0xec, 0xe6, 0xf8, 0xf2, 0xd3, 0xd9, 0xc7, 0xcd, 0xfb, 0xf1, 0xef, 0xe5, 0x83, 0x89, 0x97, 0x9d, 0xab, 0xa1, 0xbf, 0xb5, 0x73, 0x79, 0x67, 0x6d, 0x5b, 0x51, 0x4f, 0x45, 0x23, 0x29, 0x37, 0x3d, 0xb, 0x1, 0x1f, 0x15, 0x8e, 0x84, 0x9a, 0x90, 0xa6, 0xac, 0xb2, 0xb8, 0xde, 0xd4, 0xca, 0xc0, 0xf6, 0xfc, 0xe2, 0xe8, 0x2e, 0x24, 0x3a, 0x30, 0x6, 0xc, 0x12, 0x18, 0x7e, 0x74, 0x6a, 0x60, 0x56, 0x5c, 0x42, 0x48}, + {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0x7d, 0x76, 0x6b, 0x60, 0x51, 0x5a, 0x47, 0x4c, 0x25, 0x2e, 0x33, 0x38, 0x9, 0x2, 0x1f, 0x14, 0xcd, 0xc6, 0xdb, 0xd0, 0xe1, 0xea, 0xf7, 0xfc, 0x95, 0x9e, 0x83, 0x88, 0xb9, 0xb2, 0xaf, 0xa4, 0xfa, 0xf1, 0xec, 0xe7, 0xd6, 0xdd, 0xc0, 0xcb, 0xa2, 0xa9, 0xb4, 0xbf, 0x8e, 0x85, 0x98, 0x93, 0x4a, 0x41, 0x5c, 0x57, 0x66, 0x6d, 0x70, 0x7b, 0x12, 0x19, 0x4, 0xf, 0x3e, 0x35, 0x28, 0x23, 0x87, 0x8c, 0x91, 0x9a, 0xab, 0xa0, 0xbd, 0xb6, 0xdf, 0xd4, 0xc9, 0xc2, 0xf3, 0xf8, 0xe5, 0xee, 0x37, 0x3c, 0x21, 0x2a, 0x1b, 0x10, 0xd, 0x6, 0x6f, 0x64, 0x79, 0x72, 0x43, 0x48, 0x55, 0x5e, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, 0x1, 0xa, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x94, 0x9f, 0x82, 0x89, 0xb8, 0xb3, 0xae, 0xa5, 0xcc, 0xc7, 0xda, 0xd1, 0xe0, 0xeb, 0xf6, 0xfd, 0x24, 0x2f, 0x32, 0x39, 0x8, 0x3, 0x1e, 0x15, 0x7c, 0x77, 0x6a, 0x61, 0x50, 0x5b, 0x46, 0x4d, 0x13, 0x18, 0x5, 0xe, 0x3f, 0x34, 0x29, 0x22, 0x4b, 0x40, 0x5d, 0x56, 0x67, 0x6c, 0x71, 0x7a, 0xa3, 0xa8, 0xb5, 0xbe, 0x8f, 0x84, 0x99, 0x92, 0xfb, 0xf0, 0xed, 0xe6, 0xd7, 0xdc, 0xc1, 0xca, 0x6e, 0x65, 0x78, 0x73, 0x42, 0x49, 0x54, 0x5f, 0x36, 0x3d, 0x20, 0x2b, 0x1a, 0x11, 0xc, 0x7, 0xde, 0xd5, 0xc8, 0xc3, 0xf2, 0xf9, 0xe4, 0xef, 0x86, 0x8d, 0x90, 0x9b, 0xaa, 0xa1, 0xbc, 0xb7}, + {0x0, 0xc, 0x18, 0x14, 
0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44, 0xc0, 0xcc, 0xd8, 0xd4, 0xf0, 0xfc, 0xe8, 0xe4, 0xa0, 0xac, 0xb8, 0xb4, 0x90, 0x9c, 0x88, 0x84, 0x9d, 0x91, 0x85, 0x89, 0xad, 0xa1, 0xb5, 0xb9, 0xfd, 0xf1, 0xe5, 0xe9, 0xcd, 0xc1, 0xd5, 0xd9, 0x5d, 0x51, 0x45, 0x49, 0x6d, 0x61, 0x75, 0x79, 0x3d, 0x31, 0x25, 0x29, 0xd, 0x1, 0x15, 0x19, 0x27, 0x2b, 0x3f, 0x33, 0x17, 0x1b, 0xf, 0x3, 0x47, 0x4b, 0x5f, 0x53, 0x77, 0x7b, 0x6f, 0x63, 0xe7, 0xeb, 0xff, 0xf3, 0xd7, 0xdb, 0xcf, 0xc3, 0x87, 0x8b, 0x9f, 0x93, 0xb7, 0xbb, 0xaf, 0xa3, 0xba, 0xb6, 0xa2, 0xae, 0x8a, 0x86, 0x92, 0x9e, 0xda, 0xd6, 0xc2, 0xce, 0xea, 0xe6, 0xf2, 0xfe, 0x7a, 0x76, 0x62, 0x6e, 0x4a, 0x46, 0x52, 0x5e, 0x1a, 0x16, 0x2, 0xe, 0x2a, 0x26, 0x32, 0x3e, 0x4e, 0x42, 0x56, 0x5a, 0x7e, 0x72, 0x66, 0x6a, 0x2e, 0x22, 0x36, 0x3a, 0x1e, 0x12, 0x6, 0xa, 0x8e, 0x82, 0x96, 0x9a, 0xbe, 0xb2, 0xa6, 0xaa, 0xee, 0xe2, 0xf6, 0xfa, 0xde, 0xd2, 0xc6, 0xca, 0xd3, 0xdf, 0xcb, 0xc7, 0xe3, 0xef, 0xfb, 0xf7, 0xb3, 0xbf, 0xab, 0xa7, 0x83, 0x8f, 0x9b, 0x97, 0x13, 0x1f, 0xb, 0x7, 0x23, 0x2f, 0x3b, 0x37, 0x73, 0x7f, 0x6b, 0x67, 0x43, 0x4f, 0x5b, 0x57, 0x69, 0x65, 0x71, 0x7d, 0x59, 0x55, 0x41, 0x4d, 0x9, 0x5, 0x11, 0x1d, 0x39, 0x35, 0x21, 0x2d, 0xa9, 0xa5, 0xb1, 0xbd, 0x99, 0x95, 0x81, 0x8d, 0xc9, 0xc5, 0xd1, 0xdd, 0xf9, 0xf5, 0xe1, 0xed, 0xf4, 0xf8, 0xec, 0xe0, 0xc4, 0xc8, 0xdc, 0xd0, 0x94, 0x98, 0x8c, 0x80, 0xa4, 0xa8, 0xbc, 0xb0, 0x34, 0x38, 0x2c, 0x20, 0x4, 0x8, 0x1c, 0x10, 0x54, 0x58, 0x4c, 0x40, 0x64, 0x68, 0x7c, 0x70}, + {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x5, 0x8, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0xf, 0x2, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, 0xa, 0x7, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, 0xce, 0xc3, 0xd4, 0xd9, 0xfa, 0xf7, 0xe0, 0xed, 0xa6, 0xab, 0xbc, 0xb1, 0x92, 0x9f, 0x88, 0x85, 0x1e, 0x13, 0x4, 0x9, 0x2a, 0x27, 0x30, 0x3d, 0x76, 0x7b, 0x6c, 0x61, 0x42, 0x4f, 0x58, 0x55, 0x73, 0x7e, 0x69, 0x64, 0x47, 0x4a, 0x5d, 0x50, 0x1b, 0x16, 0x1, 0xc, 0x2f, 0x22, 0x35, 0x38, 0xa3, 0xae, 0xb9, 0xb4, 0x97, 0x9a, 0x8d, 0x80, 0xcb, 0xc6, 0xd1, 0xdc, 0xff, 0xf2, 0xe5, 0xe8, 0xa9, 0xa4, 0xb3, 0xbe, 0x9d, 0x90, 0x87, 0x8a, 0xc1, 0xcc, 0xdb, 0xd6, 0xf5, 0xf8, 0xef, 0xe2, 0x79, 0x74, 0x63, 0x6e, 0x4d, 0x40, 0x57, 0x5a, 0x11, 0x1c, 0xb, 0x6, 0x25, 0x28, 0x3f, 0x32, 0x14, 0x19, 0xe, 0x3, 0x20, 0x2d, 0x3a, 0x37, 0x7c, 0x71, 0x66, 0x6b, 0x48, 0x45, 0x52, 0x5f, 0xc4, 0xc9, 0xde, 0xd3, 0xf0, 0xfd, 0xea, 0xe7, 0xac, 0xa1, 0xb6, 0xbb, 0x98, 0x95, 0x82, 0x8f}, + {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0x3d, 0x33, 0x21, 0x2f, 0x5, 0xb, 0x19, 0x17, 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, 
0x37, 0x39, 0x2b, 0x25, 0xf, 0x1, 0x13, 0x1d, 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0xa, 0x4, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 0x53, 0x5d, 0x4f, 0x41, 0x6b, 0x65, 0x77, 0x79, 0x23, 0x2d, 0x3f, 0x31, 0x1b, 0x15, 0x7, 0x9, 0xb3, 0xbd, 0xaf, 0xa1, 0x8b, 0x85, 0x97, 0x99, 0xc3, 0xcd, 0xdf, 0xd1, 0xfb, 0xf5, 0xe7, 0xe9, 0x8e, 0x80, 0x92, 0x9c, 0xb6, 0xb8, 0xaa, 0xa4, 0xfe, 0xf0, 0xe2, 0xec, 0xc6, 0xc8, 0xda, 0xd4, 0x6e, 0x60, 0x72, 0x7c, 0x56, 0x58, 0x4a, 0x44, 0x1e, 0x10, 0x2, 0xc, 0x26, 0x28, 0x3a, 0x34, 0xf4, 0xfa, 0xe8, 0xe6, 0xcc, 0xc2, 0xd0, 0xde, 0x84, 0x8a, 0x98, 0x96, 0xbc, 0xb2, 0xa0, 0xae, 0x14, 0x1a, 0x8, 0x6, 0x2c, 0x22, 0x30, 0x3e, 0x64, 0x6a, 0x78, 0x76, 0x5c, 0x52, 0x40, 0x4e, 0x29, 0x27, 0x35, 0x3b, 0x11, 0x1f, 0xd, 0x3, 0x59, 0x57, 0x45, 0x4b, 0x61, 0x6f, 0x7d, 0x73, 0xc9, 0xc7, 0xd5, 0xdb, 0xf1, 0xff, 0xed, 0xe3, 0xb9, 0xb7, 0xa5, 0xab, 0x81, 0x8f, 0x9d, 0x93}, + {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55, 0xf0, 0xff, 0xee, 0xe1, 0xcc, 0xc3, 0xd2, 0xdd, 0x88, 0x87, 0x96, 0x99, 0xb4, 0xbb, 0xaa, 0xa5, 0xfd, 0xf2, 0xe3, 0xec, 0xc1, 0xce, 0xdf, 0xd0, 0x85, 0x8a, 0x9b, 0x94, 0xb9, 0xb6, 0xa7, 0xa8, 0xd, 0x2, 0x13, 0x1c, 0x31, 0x3e, 0x2f, 0x20, 0x75, 0x7a, 0x6b, 0x64, 0x49, 0x46, 0x57, 0x58, 0xe7, 0xe8, 0xf9, 0xf6, 0xdb, 0xd4, 0xc5, 0xca, 0x9f, 0x90, 0x81, 0x8e, 0xa3, 0xac, 0xbd, 0xb2, 0x17, 0x18, 0x9, 0x6, 0x2b, 0x24, 0x35, 0x3a, 0x6f, 0x60, 0x71, 0x7e, 0x53, 0x5c, 0x4d, 0x42, 0x1a, 0x15, 0x4, 0xb, 0x26, 0x29, 0x38, 0x37, 0x62, 0x6d, 0x7c, 0x73, 0x5e, 0x51, 0x40, 0x4f, 0xea, 0xe5, 0xf4, 0xfb, 0xd6, 0xd9, 0xc8, 0xc7, 0x92, 0x9d, 0x8c, 0x83, 0xae, 0xa1, 0xb0, 0xbf, 0xd3, 0xdc, 0xcd, 0xc2, 0xef, 0xe0, 0xf1, 0xfe, 0xab, 0xa4, 0xb5, 0xba, 0x97, 0x98, 0x89, 0x86, 0x23, 0x2c, 0x3d, 0x32, 0x1f, 0x10, 0x1, 0xe, 0x5b, 0x54, 0x45, 0x4a, 0x67, 0x68, 0x79, 0x76, 0x2e, 0x21, 0x30, 0x3f, 0x12, 0x1d, 0xc, 0x3, 0x56, 0x59, 0x48, 0x47, 0x6a, 0x65, 0x74, 0x7b, 0xde, 0xd1, 0xc0, 0xcf, 0xe2, 0xed, 0xfc, 0xf3, 0xa6, 0xa9, 0xb8, 0xb7, 0x9a, 0x95, 0x84, 0x8b, 0x34, 0x3b, 0x2a, 0x25, 0x8, 0x7, 0x16, 0x19, 0x4c, 0x43, 0x52, 0x5d, 0x70, 0x7f, 0x6e, 0x61, 0xc4, 0xcb, 0xda, 0xd5, 0xf8, 0xf7, 0xe6, 0xe9, 0xbc, 0xb3, 0xa2, 0xad, 0x80, 0x8f, 0x9e, 0x91, 0xc9, 0xc6, 0xd7, 0xd8, 0xf5, 0xfa, 0xeb, 0xe4, 0xb1, 0xbe, 0xaf, 0xa0, 0x8d, 0x82, 0x93, 0x9c, 0x39, 0x36, 0x27, 0x28, 0x5, 0xa, 0x1b, 0x14, 0x41, 0x4e, 0x5f, 0x50, 0x7d, 0x72, 0x63, 0x6c}, + {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0x1d, 0xd, 0x3d, 0x2d, 0x5d, 0x4d, 0x7d, 0x6d, 0x9d, 0x8d, 0xbd, 0xad, 0xdd, 0xcd, 0xfd, 0xed, 0x3a, 0x2a, 0x1a, 0xa, 0x7a, 0x6a, 0x5a, 0x4a, 0xba, 0xaa, 0x9a, 0x8a, 0xfa, 0xea, 0xda, 0xca, 0x27, 0x37, 0x7, 0x17, 0x67, 0x77, 0x47, 0x57, 0xa7, 0xb7, 0x87, 0x97, 0xe7, 0xf7, 0xc7, 0xd7, 0x74, 0x64, 0x54, 0x44, 0x34, 0x24, 0x14, 0x4, 0xf4, 0xe4, 0xd4, 0xc4, 0xb4, 0xa4, 0x94, 0x84, 0x69, 0x79, 0x49, 0x59, 0x29, 0x39, 0x9, 0x19, 0xe9, 0xf9, 0xc9, 0xd9, 0xa9, 0xb9, 0x89, 0x99, 0x4e, 0x5e, 0x6e, 0x7e, 0xe, 0x1e, 0x2e, 0x3e, 0xce, 0xde, 0xee, 0xfe, 0x8e, 0x9e, 0xae, 0xbe, 0x53, 0x43, 0x73, 0x63, 0x13, 0x3, 0x33, 0x23, 0xd3, 0xc3, 0xf3, 0xe3, 0x93, 0x83, 0xb3, 0xa3, 0xe8, 0xf8, 0xc8, 0xd8, 0xa8, 0xb8, 0x88, 0x98, 0x68, 0x78, 0x48, 0x58, 0x28, 0x38, 0x8, 0x18, 0xf5, 0xe5, 0xd5, 0xc5, 0xb5, 0xa5, 0x95, 0x85, 0x75, 0x65, 0x55, 0x45, 0x35, 0x25, 0x15, 0x5, 0xd2, 0xc2, 0xf2, 0xe2, 0x92, 0x82, 0xb2, 0xa2, 0x52, 0x42, 0x72, 0x62, 0x12, 
0x2, 0x32, 0x22, 0xcf, 0xdf, 0xef, 0xff, 0x8f, 0x9f, 0xaf, 0xbf, 0x4f, 0x5f, 0x6f, 0x7f, 0xf, 0x1f, 0x2f, 0x3f, 0x9c, 0x8c, 0xbc, 0xac, 0xdc, 0xcc, 0xfc, 0xec, 0x1c, 0xc, 0x3c, 0x2c, 0x5c, 0x4c, 0x7c, 0x6c, 0x81, 0x91, 0xa1, 0xb1, 0xc1, 0xd1, 0xe1, 0xf1, 0x1, 0x11, 0x21, 0x31, 0x41, 0x51, 0x61, 0x71, 0xa6, 0xb6, 0x86, 0x96, 0xe6, 0xf6, 0xc6, 0xd6, 0x26, 0x36, 0x6, 0x16, 0x66, 0x76, 0x46, 0x56, 0xbb, 0xab, 0x9b, 0x8b, 0xfb, 0xeb, 0xdb, 0xcb, 0x3b, 0x2b, 0x1b, 0xb, 0x7b, 0x6b, 0x5b, 0x4b}, + {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0xd, 0x1c, 0x2f, 0x3e, 0x49, 0x58, 0x6b, 0x7a, 0x85, 0x94, 0xa7, 0xb6, 0xc1, 0xd0, 0xe3, 0xf2, 0x1a, 0xb, 0x38, 0x29, 0x5e, 0x4f, 0x7c, 0x6d, 0x92, 0x83, 0xb0, 0xa1, 0xd6, 0xc7, 0xf4, 0xe5, 0x17, 0x6, 0x35, 0x24, 0x53, 0x42, 0x71, 0x60, 0x9f, 0x8e, 0xbd, 0xac, 0xdb, 0xca, 0xf9, 0xe8, 0x34, 0x25, 0x16, 0x7, 0x70, 0x61, 0x52, 0x43, 0xbc, 0xad, 0x9e, 0x8f, 0xf8, 0xe9, 0xda, 0xcb, 0x39, 0x28, 0x1b, 0xa, 0x7d, 0x6c, 0x5f, 0x4e, 0xb1, 0xa0, 0x93, 0x82, 0xf5, 0xe4, 0xd7, 0xc6, 0x2e, 0x3f, 0xc, 0x1d, 0x6a, 0x7b, 0x48, 0x59, 0xa6, 0xb7, 0x84, 0x95, 0xe2, 0xf3, 0xc0, 0xd1, 0x23, 0x32, 0x1, 0x10, 0x67, 0x76, 0x45, 0x54, 0xab, 0xba, 0x89, 0x98, 0xef, 0xfe, 0xcd, 0xdc, 0x68, 0x79, 0x4a, 0x5b, 0x2c, 0x3d, 0xe, 0x1f, 0xe0, 0xf1, 0xc2, 0xd3, 0xa4, 0xb5, 0x86, 0x97, 0x65, 0x74, 0x47, 0x56, 0x21, 0x30, 0x3, 0x12, 0xed, 0xfc, 0xcf, 0xde, 0xa9, 0xb8, 0x8b, 0x9a, 0x72, 0x63, 0x50, 0x41, 0x36, 0x27, 0x14, 0x5, 0xfa, 0xeb, 0xd8, 0xc9, 0xbe, 0xaf, 0x9c, 0x8d, 0x7f, 0x6e, 0x5d, 0x4c, 0x3b, 0x2a, 0x19, 0x8, 0xf7, 0xe6, 0xd5, 0xc4, 0xb3, 0xa2, 0x91, 0x80, 0x5c, 0x4d, 0x7e, 0x6f, 0x18, 0x9, 0x3a, 0x2b, 0xd4, 0xc5, 0xf6, 0xe7, 0x90, 0x81, 0xb2, 0xa3, 0x51, 0x40, 0x73, 0x62, 0x15, 0x4, 0x37, 0x26, 0xd9, 0xc8, 0xfb, 0xea, 0x9d, 0x8c, 0xbf, 0xae, 0x46, 0x57, 0x64, 0x75, 0x2, 0x13, 0x20, 0x31, 0xce, 0xdf, 0xec, 0xfd, 0x8a, 0x9b, 0xa8, 0xb9, 0x4b, 0x5a, 0x69, 0x78, 0xf, 0x1e, 0x2d, 0x3c, 0xc3, 0xd2, 0xe1, 0xf0, 0x87, 0x96, 0xa5, 0xb4}, + {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee, 0x3d, 0x2f, 0x19, 0xb, 0x75, 0x67, 0x51, 0x43, 0xad, 0xbf, 0x89, 0x9b, 0xe5, 0xf7, 0xc1, 0xd3, 0x7a, 0x68, 0x5e, 0x4c, 0x32, 0x20, 0x16, 0x4, 0xea, 0xf8, 0xce, 0xdc, 0xa2, 0xb0, 0x86, 0x94, 0x47, 0x55, 0x63, 0x71, 0xf, 0x1d, 0x2b, 0x39, 0xd7, 0xc5, 0xf3, 0xe1, 0x9f, 0x8d, 0xbb, 0xa9, 0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a, 0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x8, 0x1a, 0xc9, 0xdb, 0xed, 0xff, 0x81, 0x93, 0xa5, 0xb7, 0x59, 0x4b, 0x7d, 0x6f, 0x11, 0x3, 0x35, 0x27, 0x8e, 0x9c, 0xaa, 0xb8, 0xc6, 0xd4, 0xe2, 0xf0, 0x1e, 0xc, 0x3a, 0x28, 0x56, 0x44, 0x72, 0x60, 0xb3, 0xa1, 0x97, 0x85, 0xfb, 0xe9, 0xdf, 0xcd, 0x23, 0x31, 0x7, 0x15, 0x6b, 0x79, 0x4f, 0x5d, 0xf5, 0xe7, 0xd1, 0xc3, 0xbd, 0xaf, 0x99, 0x8b, 0x65, 0x77, 0x41, 0x53, 0x2d, 0x3f, 0x9, 0x1b, 0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6, 0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x2, 0x34, 0x26, 0x8f, 0x9d, 0xab, 0xb9, 0xc7, 0xd5, 0xe3, 0xf1, 0x1f, 0xd, 0x3b, 0x29, 0x57, 0x45, 0x73, 0x61, 0xb2, 0xa0, 0x96, 0x84, 0xfa, 0xe8, 0xde, 0xcc, 0x22, 0x30, 0x6, 0x14, 0x6a, 0x78, 0x4e, 0x5c, 0x1, 0x13, 0x25, 0x37, 0x49, 0x5b, 0x6d, 0x7f, 0x91, 0x83, 0xb5, 0xa7, 0xd9, 0xcb, 0xfd, 0xef, 0x3c, 0x2e, 0x18, 0xa, 0x74, 0x66, 0x50, 0x42, 0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2, 0x7b, 0x69, 0x5f, 0x4d, 0x33, 0x21, 0x17, 0x5, 0xeb, 0xf9, 0xcf, 0xdd, 0xa3, 0xb1, 0x87, 0x95, 0x46, 0x54, 0x62, 0x70, 0xe, 0x1c, 0x2a, 0x38, 0xd6, 0xc4, 0xf2, 0xe0, 0x9e, 0x8c, 0xba, 0xa8}, + {0x0, 0x13, 
0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1, 0x2d, 0x3e, 0xb, 0x18, 0x61, 0x72, 0x47, 0x54, 0xb5, 0xa6, 0x93, 0x80, 0xf9, 0xea, 0xdf, 0xcc, 0x5a, 0x49, 0x7c, 0x6f, 0x16, 0x5, 0x30, 0x23, 0xc2, 0xd1, 0xe4, 0xf7, 0x8e, 0x9d, 0xa8, 0xbb, 0x77, 0x64, 0x51, 0x42, 0x3b, 0x28, 0x1d, 0xe, 0xef, 0xfc, 0xc9, 0xda, 0xa3, 0xb0, 0x85, 0x96, 0xb4, 0xa7, 0x92, 0x81, 0xf8, 0xeb, 0xde, 0xcd, 0x2c, 0x3f, 0xa, 0x19, 0x60, 0x73, 0x46, 0x55, 0x99, 0x8a, 0xbf, 0xac, 0xd5, 0xc6, 0xf3, 0xe0, 0x1, 0x12, 0x27, 0x34, 0x4d, 0x5e, 0x6b, 0x78, 0xee, 0xfd, 0xc8, 0xdb, 0xa2, 0xb1, 0x84, 0x97, 0x76, 0x65, 0x50, 0x43, 0x3a, 0x29, 0x1c, 0xf, 0xc3, 0xd0, 0xe5, 0xf6, 0x8f, 0x9c, 0xa9, 0xba, 0x5b, 0x48, 0x7d, 0x6e, 0x17, 0x4, 0x31, 0x22, 0x75, 0x66, 0x53, 0x40, 0x39, 0x2a, 0x1f, 0xc, 0xed, 0xfe, 0xcb, 0xd8, 0xa1, 0xb2, 0x87, 0x94, 0x58, 0x4b, 0x7e, 0x6d, 0x14, 0x7, 0x32, 0x21, 0xc0, 0xd3, 0xe6, 0xf5, 0x8c, 0x9f, 0xaa, 0xb9, 0x2f, 0x3c, 0x9, 0x1a, 0x63, 0x70, 0x45, 0x56, 0xb7, 0xa4, 0x91, 0x82, 0xfb, 0xe8, 0xdd, 0xce, 0x2, 0x11, 0x24, 0x37, 0x4e, 0x5d, 0x68, 0x7b, 0x9a, 0x89, 0xbc, 0xaf, 0xd6, 0xc5, 0xf0, 0xe3, 0xc1, 0xd2, 0xe7, 0xf4, 0x8d, 0x9e, 0xab, 0xb8, 0x59, 0x4a, 0x7f, 0x6c, 0x15, 0x6, 0x33, 0x20, 0xec, 0xff, 0xca, 0xd9, 0xa0, 0xb3, 0x86, 0x95, 0x74, 0x67, 0x52, 0x41, 0x38, 0x2b, 0x1e, 0xd, 0x9b, 0x88, 0xbd, 0xae, 0xd7, 0xc4, 0xf1, 0xe2, 0x3, 0x10, 0x25, 0x36, 0x4f, 0x5c, 0x69, 0x7a, 0xb6, 0xa5, 0x90, 0x83, 0xfa, 0xe9, 0xdc, 0xcf, 0x2e, 0x3d, 0x8, 0x1b, 0x62, 0x71, 0x44, 0x57}, + {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc, 0x5d, 0x49, 0x75, 0x61, 0xd, 0x19, 0x25, 0x31, 0xfd, 0xe9, 0xd5, 0xc1, 0xad, 0xb9, 0x85, 0x91, 0xba, 0xae, 0x92, 0x86, 0xea, 0xfe, 0xc2, 0xd6, 0x1a, 0xe, 0x32, 0x26, 0x4a, 0x5e, 0x62, 0x76, 0xe7, 0xf3, 0xcf, 0xdb, 0xb7, 0xa3, 0x9f, 0x8b, 0x47, 0x53, 0x6f, 0x7b, 0x17, 0x3, 0x3f, 0x2b, 0x69, 0x7d, 0x41, 0x55, 0x39, 0x2d, 0x11, 0x5, 0xc9, 0xdd, 0xe1, 0xf5, 0x99, 0x8d, 0xb1, 0xa5, 0x34, 0x20, 0x1c, 0x8, 0x64, 0x70, 0x4c, 0x58, 0x94, 0x80, 0xbc, 0xa8, 0xc4, 0xd0, 0xec, 0xf8, 0xd3, 0xc7, 0xfb, 0xef, 0x83, 0x97, 0xab, 0xbf, 0x73, 0x67, 0x5b, 0x4f, 0x23, 0x37, 0xb, 0x1f, 0x8e, 0x9a, 0xa6, 0xb2, 0xde, 0xca, 0xf6, 0xe2, 0x2e, 0x3a, 0x6, 0x12, 0x7e, 0x6a, 0x56, 0x42, 0xd2, 0xc6, 0xfa, 0xee, 0x82, 0x96, 0xaa, 0xbe, 0x72, 0x66, 0x5a, 0x4e, 0x22, 0x36, 0xa, 0x1e, 0x8f, 0x9b, 0xa7, 0xb3, 0xdf, 0xcb, 0xf7, 0xe3, 0x2f, 0x3b, 0x7, 0x13, 0x7f, 0x6b, 0x57, 0x43, 0x68, 0x7c, 0x40, 0x54, 0x38, 0x2c, 0x10, 0x4, 0xc8, 0xdc, 0xe0, 0xf4, 0x98, 0x8c, 0xb0, 0xa4, 0x35, 0x21, 0x1d, 0x9, 0x65, 0x71, 0x4d, 0x59, 0x95, 0x81, 0xbd, 0xa9, 0xc5, 0xd1, 0xed, 0xf9, 0xbb, 0xaf, 0x93, 0x87, 0xeb, 0xff, 0xc3, 0xd7, 0x1b, 0xf, 0x33, 0x27, 0x4b, 0x5f, 0x63, 0x77, 0xe6, 0xf2, 0xce, 0xda, 0xb6, 0xa2, 0x9e, 0x8a, 0x46, 0x52, 0x6e, 0x7a, 0x16, 0x2, 0x3e, 0x2a, 0x1, 0x15, 0x29, 0x3d, 0x51, 0x45, 0x79, 0x6d, 0xa1, 0xb5, 0x89, 0x9d, 0xf1, 0xe5, 0xd9, 0xcd, 0x5c, 0x48, 0x74, 0x60, 0xc, 0x18, 0x24, 0x30, 0xfc, 0xe8, 0xd4, 0xc0, 0xac, 0xb8, 0x84, 0x90}, + {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3, 0x4d, 0x58, 0x67, 0x72, 0x19, 0xc, 0x33, 0x26, 0xe5, 0xf0, 0xcf, 0xda, 0xb1, 0xa4, 0x9b, 0x8e, 0x9a, 0x8f, 0xb0, 0xa5, 0xce, 0xdb, 0xe4, 0xf1, 0x32, 0x27, 0x18, 0xd, 0x66, 0x73, 0x4c, 0x59, 0xd7, 0xc2, 0xfd, 0xe8, 0x83, 0x96, 0xa9, 0xbc, 0x7f, 0x6a, 0x55, 0x40, 0x2b, 0x3e, 0x1, 0x14, 0x29, 0x3c, 0x3, 0x16, 0x7d, 0x68, 0x57, 0x42, 0x81, 0x94, 0xab, 0xbe, 0xd5, 0xc0, 0xff, 0xea, 0x64, 0x71, 0x4e, 0x5b, 0x30, 0x25, 0x1a, 
0xf, 0xcc, 0xd9, 0xe6, 0xf3, 0x98, 0x8d, 0xb2, 0xa7, 0xb3, 0xa6, 0x99, 0x8c, 0xe7, 0xf2, 0xcd, 0xd8, 0x1b, 0xe, 0x31, 0x24, 0x4f, 0x5a, 0x65, 0x70, 0xfe, 0xeb, 0xd4, 0xc1, 0xaa, 0xbf, 0x80, 0x95, 0x56, 0x43, 0x7c, 0x69, 0x2, 0x17, 0x28, 0x3d, 0x52, 0x47, 0x78, 0x6d, 0x6, 0x13, 0x2c, 0x39, 0xfa, 0xef, 0xd0, 0xc5, 0xae, 0xbb, 0x84, 0x91, 0x1f, 0xa, 0x35, 0x20, 0x4b, 0x5e, 0x61, 0x74, 0xb7, 0xa2, 0x9d, 0x88, 0xe3, 0xf6, 0xc9, 0xdc, 0xc8, 0xdd, 0xe2, 0xf7, 0x9c, 0x89, 0xb6, 0xa3, 0x60, 0x75, 0x4a, 0x5f, 0x34, 0x21, 0x1e, 0xb, 0x85, 0x90, 0xaf, 0xba, 0xd1, 0xc4, 0xfb, 0xee, 0x2d, 0x38, 0x7, 0x12, 0x79, 0x6c, 0x53, 0x46, 0x7b, 0x6e, 0x51, 0x44, 0x2f, 0x3a, 0x5, 0x10, 0xd3, 0xc6, 0xf9, 0xec, 0x87, 0x92, 0xad, 0xb8, 0x36, 0x23, 0x1c, 0x9, 0x62, 0x77, 0x48, 0x5d, 0x9e, 0x8b, 0xb4, 0xa1, 0xca, 0xdf, 0xe0, 0xf5, 0xe1, 0xf4, 0xcb, 0xde, 0xb5, 0xa0, 0x9f, 0x8a, 0x49, 0x5c, 0x63, 0x76, 0x1d, 0x8, 0x37, 0x22, 0xac, 0xb9, 0x86, 0x93, 0xf8, 0xed, 0xd2, 0xc7, 0x4, 0x11, 0x2e, 0x3b, 0x50, 0x45, 0x7a, 0x6f}, + {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2, 0x7d, 0x6b, 0x51, 0x47, 0x25, 0x33, 0x9, 0x1f, 0xcd, 0xdb, 0xe1, 0xf7, 0x95, 0x83, 0xb9, 0xaf, 0xfa, 0xec, 0xd6, 0xc0, 0xa2, 0xb4, 0x8e, 0x98, 0x4a, 0x5c, 0x66, 0x70, 0x12, 0x4, 0x3e, 0x28, 0x87, 0x91, 0xab, 0xbd, 0xdf, 0xc9, 0xf3, 0xe5, 0x37, 0x21, 0x1b, 0xd, 0x6f, 0x79, 0x43, 0x55, 0xe9, 0xff, 0xc5, 0xd3, 0xb1, 0xa7, 0x9d, 0x8b, 0x59, 0x4f, 0x75, 0x63, 0x1, 0x17, 0x2d, 0x3b, 0x94, 0x82, 0xb8, 0xae, 0xcc, 0xda, 0xe0, 0xf6, 0x24, 0x32, 0x8, 0x1e, 0x7c, 0x6a, 0x50, 0x46, 0x13, 0x5, 0x3f, 0x29, 0x4b, 0x5d, 0x67, 0x71, 0xa3, 0xb5, 0x8f, 0x99, 0xfb, 0xed, 0xd7, 0xc1, 0x6e, 0x78, 0x42, 0x54, 0x36, 0x20, 0x1a, 0xc, 0xde, 0xc8, 0xf2, 0xe4, 0x86, 0x90, 0xaa, 0xbc, 0xcf, 0xd9, 0xe3, 0xf5, 0x97, 0x81, 0xbb, 0xad, 0x7f, 0x69, 0x53, 0x45, 0x27, 0x31, 0xb, 0x1d, 0xb2, 0xa4, 0x9e, 0x88, 0xea, 0xfc, 0xc6, 0xd0, 0x2, 0x14, 0x2e, 0x38, 0x5a, 0x4c, 0x76, 0x60, 0x35, 0x23, 0x19, 0xf, 0x6d, 0x7b, 0x41, 0x57, 0x85, 0x93, 0xa9, 0xbf, 0xdd, 0xcb, 0xf1, 0xe7, 0x48, 0x5e, 0x64, 0x72, 0x10, 0x6, 0x3c, 0x2a, 0xf8, 0xee, 0xd4, 0xc2, 0xa0, 0xb6, 0x8c, 0x9a, 0x26, 0x30, 0xa, 0x1c, 0x7e, 0x68, 0x52, 0x44, 0x96, 0x80, 0xba, 0xac, 0xce, 0xd8, 0xe2, 0xf4, 0x5b, 0x4d, 0x77, 0x61, 0x3, 0x15, 0x2f, 0x39, 0xeb, 0xfd, 0xc7, 0xd1, 0xb3, 0xa5, 0x9f, 0x89, 0xdc, 0xca, 0xf0, 0xe6, 0x84, 0x92, 0xa8, 0xbe, 0x6c, 0x7a, 0x40, 0x56, 0x34, 0x22, 0x18, 0xe, 0xa1, 0xb7, 0x8d, 0x9b, 0xf9, 0xef, 0xd5, 0xc3, 0x11, 0x7, 0x3d, 0x2b, 0x49, 0x5f, 0x65, 0x73}, + {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd, 0x6d, 0x7a, 0x43, 0x54, 0x31, 0x26, 0x1f, 0x8, 0xd5, 0xc2, 0xfb, 0xec, 0x89, 0x9e, 0xa7, 0xb0, 0xda, 0xcd, 0xf4, 0xe3, 0x86, 0x91, 0xa8, 0xbf, 0x62, 0x75, 0x4c, 0x5b, 0x3e, 0x29, 0x10, 0x7, 0xb7, 0xa0, 0x99, 0x8e, 0xeb, 0xfc, 0xc5, 0xd2, 0xf, 0x18, 0x21, 0x36, 0x53, 0x44, 0x7d, 0x6a, 0xa9, 0xbe, 0x87, 0x90, 0xf5, 0xe2, 0xdb, 0xcc, 0x11, 0x6, 0x3f, 0x28, 0x4d, 0x5a, 0x63, 0x74, 0xc4, 0xd3, 0xea, 0xfd, 0x98, 0x8f, 0xb6, 0xa1, 0x7c, 0x6b, 0x52, 0x45, 0x20, 0x37, 0xe, 0x19, 0x73, 0x64, 0x5d, 0x4a, 0x2f, 0x38, 0x1, 0x16, 0xcb, 0xdc, 0xe5, 0xf2, 0x97, 0x80, 0xb9, 0xae, 0x1e, 0x9, 0x30, 0x27, 0x42, 0x55, 0x6c, 0x7b, 0xa6, 0xb1, 0x88, 0x9f, 0xfa, 0xed, 0xd4, 0xc3, 0x4f, 0x58, 0x61, 0x76, 0x13, 0x4, 0x3d, 0x2a, 0xf7, 0xe0, 0xd9, 0xce, 0xab, 0xbc, 0x85, 0x92, 0x22, 0x35, 0xc, 0x1b, 0x7e, 0x69, 0x50, 0x47, 0x9a, 0x8d, 0xb4, 0xa3, 0xc6, 0xd1, 0xe8, 0xff, 0x95, 0x82, 0xbb, 0xac, 0xc9, 0xde, 0xe7, 0xf0, 0x2d, 0x3a, 0x3, 0x14, 
0x71, 0x66, 0x5f, 0x48, 0xf8, 0xef, 0xd6, 0xc1, 0xa4, 0xb3, 0x8a, 0x9d, 0x40, 0x57, 0x6e, 0x79, 0x1c, 0xb, 0x32, 0x25, 0xe6, 0xf1, 0xc8, 0xdf, 0xba, 0xad, 0x94, 0x83, 0x5e, 0x49, 0x70, 0x67, 0x2, 0x15, 0x2c, 0x3b, 0x8b, 0x9c, 0xa5, 0xb2, 0xd7, 0xc0, 0xf9, 0xee, 0x33, 0x24, 0x1d, 0xa, 0x6f, 0x78, 0x41, 0x56, 0x3c, 0x2b, 0x12, 0x5, 0x60, 0x77, 0x4e, 0x59, 0x84, 0x93, 0xaa, 0xbd, 0xd8, 0xcf, 0xf6, 0xe1, 0x51, 0x46, 0x7f, 0x68, 0xd, 0x1a, 0x23, 0x34, 0xe9, 0xfe, 0xc7, 0xd0, 0xb5, 0xa2, 0x9b, 0x8c}, + {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88, 0x9d, 0x85, 0xad, 0xb5, 0xfd, 0xe5, 0xcd, 0xd5, 0x5d, 0x45, 0x6d, 0x75, 0x3d, 0x25, 0xd, 0x15, 0x27, 0x3f, 0x17, 0xf, 0x47, 0x5f, 0x77, 0x6f, 0xe7, 0xff, 0xd7, 0xcf, 0x87, 0x9f, 0xb7, 0xaf, 0xba, 0xa2, 0x8a, 0x92, 0xda, 0xc2, 0xea, 0xf2, 0x7a, 0x62, 0x4a, 0x52, 0x1a, 0x2, 0x2a, 0x32, 0x4e, 0x56, 0x7e, 0x66, 0x2e, 0x36, 0x1e, 0x6, 0x8e, 0x96, 0xbe, 0xa6, 0xee, 0xf6, 0xde, 0xc6, 0xd3, 0xcb, 0xe3, 0xfb, 0xb3, 0xab, 0x83, 0x9b, 0x13, 0xb, 0x23, 0x3b, 0x73, 0x6b, 0x43, 0x5b, 0x69, 0x71, 0x59, 0x41, 0x9, 0x11, 0x39, 0x21, 0xa9, 0xb1, 0x99, 0x81, 0xc9, 0xd1, 0xf9, 0xe1, 0xf4, 0xec, 0xc4, 0xdc, 0x94, 0x8c, 0xa4, 0xbc, 0x34, 0x2c, 0x4, 0x1c, 0x54, 0x4c, 0x64, 0x7c, 0x9c, 0x84, 0xac, 0xb4, 0xfc, 0xe4, 0xcc, 0xd4, 0x5c, 0x44, 0x6c, 0x74, 0x3c, 0x24, 0xc, 0x14, 0x1, 0x19, 0x31, 0x29, 0x61, 0x79, 0x51, 0x49, 0xc1, 0xd9, 0xf1, 0xe9, 0xa1, 0xb9, 0x91, 0x89, 0xbb, 0xa3, 0x8b, 0x93, 0xdb, 0xc3, 0xeb, 0xf3, 0x7b, 0x63, 0x4b, 0x53, 0x1b, 0x3, 0x2b, 0x33, 0x26, 0x3e, 0x16, 0xe, 0x46, 0x5e, 0x76, 0x6e, 0xe6, 0xfe, 0xd6, 0xce, 0x86, 0x9e, 0xb6, 0xae, 0xd2, 0xca, 0xe2, 0xfa, 0xb2, 0xaa, 0x82, 0x9a, 0x12, 0xa, 0x22, 0x3a, 0x72, 0x6a, 0x42, 0x5a, 0x4f, 0x57, 0x7f, 0x67, 0x2f, 0x37, 0x1f, 0x7, 0x8f, 0x97, 0xbf, 0xa7, 0xef, 0xf7, 0xdf, 0xc7, 0xf5, 0xed, 0xc5, 0xdd, 0x95, 0x8d, 0xa5, 0xbd, 0x35, 0x2d, 0x5, 0x1d, 0x55, 0x4d, 0x65, 0x7d, 0x68, 0x70, 0x58, 0x40, 0x8, 0x10, 0x38, 0x20, 0xa8, 0xb0, 0x98, 0x80, 0xc8, 0xd0, 0xf8, 0xe0}, + {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87, 0x8d, 0x94, 0xbf, 0xa6, 0xe9, 0xf0, 0xdb, 0xc2, 0x45, 0x5c, 0x77, 0x6e, 0x21, 0x38, 0x13, 0xa, 0x7, 0x1e, 0x35, 0x2c, 0x63, 0x7a, 0x51, 0x48, 0xcf, 0xd6, 0xfd, 0xe4, 0xab, 0xb2, 0x99, 0x80, 0x8a, 0x93, 0xb8, 0xa1, 0xee, 0xf7, 0xdc, 0xc5, 0x42, 0x5b, 0x70, 0x69, 0x26, 0x3f, 0x14, 0xd, 0xe, 0x17, 0x3c, 0x25, 0x6a, 0x73, 0x58, 0x41, 0xc6, 0xdf, 0xf4, 0xed, 0xa2, 0xbb, 0x90, 0x89, 0x83, 0x9a, 0xb1, 0xa8, 0xe7, 0xfe, 0xd5, 0xcc, 0x4b, 0x52, 0x79, 0x60, 0x2f, 0x36, 0x1d, 0x4, 0x9, 0x10, 0x3b, 0x22, 0x6d, 0x74, 0x5f, 0x46, 0xc1, 0xd8, 0xf3, 0xea, 0xa5, 0xbc, 0x97, 0x8e, 0x84, 0x9d, 0xb6, 0xaf, 0xe0, 0xf9, 0xd2, 0xcb, 0x4c, 0x55, 0x7e, 0x67, 0x28, 0x31, 0x1a, 0x3, 0x1c, 0x5, 0x2e, 0x37, 0x78, 0x61, 0x4a, 0x53, 0xd4, 0xcd, 0xe6, 0xff, 0xb0, 0xa9, 0x82, 0x9b, 0x91, 0x88, 0xa3, 0xba, 0xf5, 0xec, 0xc7, 0xde, 0x59, 0x40, 0x6b, 0x72, 0x3d, 0x24, 0xf, 0x16, 0x1b, 0x2, 0x29, 0x30, 0x7f, 0x66, 0x4d, 0x54, 0xd3, 0xca, 0xe1, 0xf8, 0xb7, 0xae, 0x85, 0x9c, 0x96, 0x8f, 0xa4, 0xbd, 0xf2, 0xeb, 0xc0, 0xd9, 0x5e, 0x47, 0x6c, 0x75, 0x3a, 0x23, 0x8, 0x11, 0x12, 0xb, 0x20, 0x39, 0x76, 0x6f, 0x44, 0x5d, 0xda, 0xc3, 0xe8, 0xf1, 0xbe, 0xa7, 0x8c, 0x95, 0x9f, 0x86, 0xad, 0xb4, 0xfb, 0xe2, 0xc9, 0xd0, 0x57, 0x4e, 0x65, 0x7c, 0x33, 0x2a, 0x1, 0x18, 0x15, 0xc, 0x27, 0x3e, 0x71, 0x68, 0x43, 0x5a, 0xdd, 0xc4, 0xef, 0xf6, 0xb9, 0xa0, 0x8b, 0x92, 0x98, 0x81, 0xaa, 0xb3, 0xfc, 0xe5, 0xce, 0xd7, 0x50, 0x49, 0x62, 0x7b, 0x34, 0x2d, 0x6, 0x1f}, + 
{0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96, 0xbd, 0xa7, 0x89, 0x93, 0xd5, 0xcf, 0xe1, 0xfb, 0x6d, 0x77, 0x59, 0x43, 0x5, 0x1f, 0x31, 0x2b, 0x67, 0x7d, 0x53, 0x49, 0xf, 0x15, 0x3b, 0x21, 0xb7, 0xad, 0x83, 0x99, 0xdf, 0xc5, 0xeb, 0xf1, 0xda, 0xc0, 0xee, 0xf4, 0xb2, 0xa8, 0x86, 0x9c, 0xa, 0x10, 0x3e, 0x24, 0x62, 0x78, 0x56, 0x4c, 0xce, 0xd4, 0xfa, 0xe0, 0xa6, 0xbc, 0x92, 0x88, 0x1e, 0x4, 0x2a, 0x30, 0x76, 0x6c, 0x42, 0x58, 0x73, 0x69, 0x47, 0x5d, 0x1b, 0x1, 0x2f, 0x35, 0xa3, 0xb9, 0x97, 0x8d, 0xcb, 0xd1, 0xff, 0xe5, 0xa9, 0xb3, 0x9d, 0x87, 0xc1, 0xdb, 0xf5, 0xef, 0x79, 0x63, 0x4d, 0x57, 0x11, 0xb, 0x25, 0x3f, 0x14, 0xe, 0x20, 0x3a, 0x7c, 0x66, 0x48, 0x52, 0xc4, 0xde, 0xf0, 0xea, 0xac, 0xb6, 0x98, 0x82, 0x81, 0x9b, 0xb5, 0xaf, 0xe9, 0xf3, 0xdd, 0xc7, 0x51, 0x4b, 0x65, 0x7f, 0x39, 0x23, 0xd, 0x17, 0x3c, 0x26, 0x8, 0x12, 0x54, 0x4e, 0x60, 0x7a, 0xec, 0xf6, 0xd8, 0xc2, 0x84, 0x9e, 0xb0, 0xaa, 0xe6, 0xfc, 0xd2, 0xc8, 0x8e, 0x94, 0xba, 0xa0, 0x36, 0x2c, 0x2, 0x18, 0x5e, 0x44, 0x6a, 0x70, 0x5b, 0x41, 0x6f, 0x75, 0x33, 0x29, 0x7, 0x1d, 0x8b, 0x91, 0xbf, 0xa5, 0xe3, 0xf9, 0xd7, 0xcd, 0x4f, 0x55, 0x7b, 0x61, 0x27, 0x3d, 0x13, 0x9, 0x9f, 0x85, 0xab, 0xb1, 0xf7, 0xed, 0xc3, 0xd9, 0xf2, 0xe8, 0xc6, 0xdc, 0x9a, 0x80, 0xae, 0xb4, 0x22, 0x38, 0x16, 0xc, 0x4a, 0x50, 0x7e, 0x64, 0x28, 0x32, 0x1c, 0x6, 0x40, 0x5a, 0x74, 0x6e, 0xf8, 0xe2, 0xcc, 0xd6, 0x90, 0x8a, 0xa4, 0xbe, 0x95, 0x8f, 0xa1, 0xbb, 0xfd, 0xe7, 0xc9, 0xd3, 0x45, 0x5f, 0x71, 0x6b, 0x2d, 0x37, 0x19, 0x3}, + {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99, 0xad, 0xb6, 0x9b, 0x80, 0xc1, 0xda, 0xf7, 0xec, 0x75, 0x6e, 0x43, 0x58, 0x19, 0x2, 0x2f, 0x34, 0x47, 0x5c, 0x71, 0x6a, 0x2b, 0x30, 0x1d, 0x6, 0x9f, 0x84, 0xa9, 0xb2, 0xf3, 0xe8, 0xc5, 0xde, 0xea, 0xf1, 0xdc, 0xc7, 0x86, 0x9d, 0xb0, 0xab, 0x32, 0x29, 0x4, 0x1f, 0x5e, 0x45, 0x68, 0x73, 0x8e, 0x95, 0xb8, 0xa3, 0xe2, 0xf9, 0xd4, 0xcf, 0x56, 0x4d, 0x60, 0x7b, 0x3a, 0x21, 0xc, 0x17, 0x23, 0x38, 0x15, 0xe, 0x4f, 0x54, 0x79, 0x62, 0xfb, 0xe0, 0xcd, 0xd6, 0x97, 0x8c, 0xa1, 0xba, 0xc9, 0xd2, 0xff, 0xe4, 0xa5, 0xbe, 0x93, 0x88, 0x11, 0xa, 0x27, 0x3c, 0x7d, 0x66, 0x4b, 0x50, 0x64, 0x7f, 0x52, 0x49, 0x8, 0x13, 0x3e, 0x25, 0xbc, 0xa7, 0x8a, 0x91, 0xd0, 0xcb, 0xe6, 0xfd, 0x1, 0x1a, 0x37, 0x2c, 0x6d, 0x76, 0x5b, 0x40, 0xd9, 0xc2, 0xef, 0xf4, 0xb5, 0xae, 0x83, 0x98, 0xac, 0xb7, 0x9a, 0x81, 0xc0, 0xdb, 0xf6, 0xed, 0x74, 0x6f, 0x42, 0x59, 0x18, 0x3, 0x2e, 0x35, 0x46, 0x5d, 0x70, 0x6b, 0x2a, 0x31, 0x1c, 0x7, 0x9e, 0x85, 0xa8, 0xb3, 0xf2, 0xe9, 0xc4, 0xdf, 0xeb, 0xf0, 0xdd, 0xc6, 0x87, 0x9c, 0xb1, 0xaa, 0x33, 0x28, 0x5, 0x1e, 0x5f, 0x44, 0x69, 0x72, 0x8f, 0x94, 0xb9, 0xa2, 0xe3, 0xf8, 0xd5, 0xce, 0x57, 0x4c, 0x61, 0x7a, 0x3b, 0x20, 0xd, 0x16, 0x22, 0x39, 0x14, 0xf, 0x4e, 0x55, 0x78, 0x63, 0xfa, 0xe1, 0xcc, 0xd7, 0x96, 0x8d, 0xa0, 0xbb, 0xc8, 0xd3, 0xfe, 0xe5, 0xa4, 0xbf, 0x92, 0x89, 0x10, 0xb, 0x26, 0x3d, 0x7c, 0x67, 0x4a, 0x51, 0x65, 0x7e, 0x53, 0x48, 0x9, 0x12, 0x3f, 0x24, 0xbd, 0xa6, 0x8b, 0x90, 0xd1, 0xca, 0xe7, 0xfc}, + {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4, 0xdd, 0xc1, 0xe5, 0xf9, 0xad, 0xb1, 0x95, 0x89, 0x3d, 0x21, 0x5, 0x19, 0x4d, 0x51, 0x75, 0x69, 0xa7, 0xbb, 0x9f, 0x83, 0xd7, 0xcb, 0xef, 0xf3, 0x47, 0x5b, 0x7f, 0x63, 0x37, 0x2b, 0xf, 0x13, 0x7a, 0x66, 0x42, 0x5e, 0xa, 0x16, 0x32, 0x2e, 0x9a, 0x86, 0xa2, 0xbe, 0xea, 0xf6, 0xd2, 0xce, 0x53, 0x4f, 0x6b, 0x77, 0x23, 0x3f, 0x1b, 0x7, 0xb3, 0xaf, 0x8b, 0x97, 0xc3, 0xdf, 0xfb, 0xe7, 0x8e, 0x92, 0xb6, 0xaa, 0xfe, 
0xe2, 0xc6, 0xda, 0x6e, 0x72, 0x56, 0x4a, 0x1e, 0x2, 0x26, 0x3a, 0xf4, 0xe8, 0xcc, 0xd0, 0x84, 0x98, 0xbc, 0xa0, 0x14, 0x8, 0x2c, 0x30, 0x64, 0x78, 0x5c, 0x40, 0x29, 0x35, 0x11, 0xd, 0x59, 0x45, 0x61, 0x7d, 0xc9, 0xd5, 0xf1, 0xed, 0xb9, 0xa5, 0x81, 0x9d, 0xa6, 0xba, 0x9e, 0x82, 0xd6, 0xca, 0xee, 0xf2, 0x46, 0x5a, 0x7e, 0x62, 0x36, 0x2a, 0xe, 0x12, 0x7b, 0x67, 0x43, 0x5f, 0xb, 0x17, 0x33, 0x2f, 0x9b, 0x87, 0xa3, 0xbf, 0xeb, 0xf7, 0xd3, 0xcf, 0x1, 0x1d, 0x39, 0x25, 0x71, 0x6d, 0x49, 0x55, 0xe1, 0xfd, 0xd9, 0xc5, 0x91, 0x8d, 0xa9, 0xb5, 0xdc, 0xc0, 0xe4, 0xf8, 0xac, 0xb0, 0x94, 0x88, 0x3c, 0x20, 0x4, 0x18, 0x4c, 0x50, 0x74, 0x68, 0xf5, 0xe9, 0xcd, 0xd1, 0x85, 0x99, 0xbd, 0xa1, 0x15, 0x9, 0x2d, 0x31, 0x65, 0x79, 0x5d, 0x41, 0x28, 0x34, 0x10, 0xc, 0x58, 0x44, 0x60, 0x7c, 0xc8, 0xd4, 0xf0, 0xec, 0xb8, 0xa4, 0x80, 0x9c, 0x52, 0x4e, 0x6a, 0x76, 0x22, 0x3e, 0x1a, 0x6, 0xb2, 0xae, 0x8a, 0x96, 0xc2, 0xde, 0xfa, 0xe6, 0x8f, 0x93, 0xb7, 0xab, 0xff, 0xe3, 0xc7, 0xdb, 0x6f, 0x73, 0x57, 0x4b, 0x1f, 0x3, 0x27, 0x3b}, + {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb, 0xcd, 0xd0, 0xf7, 0xea, 0xb9, 0xa4, 0x83, 0x9e, 0x25, 0x38, 0x1f, 0x2, 0x51, 0x4c, 0x6b, 0x76, 0x87, 0x9a, 0xbd, 0xa0, 0xf3, 0xee, 0xc9, 0xd4, 0x6f, 0x72, 0x55, 0x48, 0x1b, 0x6, 0x21, 0x3c, 0x4a, 0x57, 0x70, 0x6d, 0x3e, 0x23, 0x4, 0x19, 0xa2, 0xbf, 0x98, 0x85, 0xd6, 0xcb, 0xec, 0xf1, 0x13, 0xe, 0x29, 0x34, 0x67, 0x7a, 0x5d, 0x40, 0xfb, 0xe6, 0xc1, 0xdc, 0x8f, 0x92, 0xb5, 0xa8, 0xde, 0xc3, 0xe4, 0xf9, 0xaa, 0xb7, 0x90, 0x8d, 0x36, 0x2b, 0xc, 0x11, 0x42, 0x5f, 0x78, 0x65, 0x94, 0x89, 0xae, 0xb3, 0xe0, 0xfd, 0xda, 0xc7, 0x7c, 0x61, 0x46, 0x5b, 0x8, 0x15, 0x32, 0x2f, 0x59, 0x44, 0x63, 0x7e, 0x2d, 0x30, 0x17, 0xa, 0xb1, 0xac, 0x8b, 0x96, 0xc5, 0xd8, 0xff, 0xe2, 0x26, 0x3b, 0x1c, 0x1, 0x52, 0x4f, 0x68, 0x75, 0xce, 0xd3, 0xf4, 0xe9, 0xba, 0xa7, 0x80, 0x9d, 0xeb, 0xf6, 0xd1, 0xcc, 0x9f, 0x82, 0xa5, 0xb8, 0x3, 0x1e, 0x39, 0x24, 0x77, 0x6a, 0x4d, 0x50, 0xa1, 0xbc, 0x9b, 0x86, 0xd5, 0xc8, 0xef, 0xf2, 0x49, 0x54, 0x73, 0x6e, 0x3d, 0x20, 0x7, 0x1a, 0x6c, 0x71, 0x56, 0x4b, 0x18, 0x5, 0x22, 0x3f, 0x84, 0x99, 0xbe, 0xa3, 0xf0, 0xed, 0xca, 0xd7, 0x35, 0x28, 0xf, 0x12, 0x41, 0x5c, 0x7b, 0x66, 0xdd, 0xc0, 0xe7, 0xfa, 0xa9, 0xb4, 0x93, 0x8e, 0xf8, 0xe5, 0xc2, 0xdf, 0x8c, 0x91, 0xb6, 0xab, 0x10, 0xd, 0x2a, 0x37, 0x64, 0x79, 0x5e, 0x43, 0xb2, 0xaf, 0x88, 0x95, 0xc6, 0xdb, 0xfc, 0xe1, 0x5a, 0x47, 0x60, 0x7d, 0x2e, 0x33, 0x14, 0x9, 0x7f, 0x62, 0x45, 0x58, 0xb, 0x16, 0x31, 0x2c, 0x97, 0x8a, 0xad, 0xb0, 0xe3, 0xfe, 0xd9, 0xc4}, + {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa, 0xfd, 0xe3, 0xc1, 0xdf, 0x85, 0x9b, 0xb9, 0xa7, 0xd, 0x13, 0x31, 0x2f, 0x75, 0x6b, 0x49, 0x57, 0xe7, 0xf9, 0xdb, 0xc5, 0x9f, 0x81, 0xa3, 0xbd, 0x17, 0x9, 0x2b, 0x35, 0x6f, 0x71, 0x53, 0x4d, 0x1a, 0x4, 0x26, 0x38, 0x62, 0x7c, 0x5e, 0x40, 0xea, 0xf4, 0xd6, 0xc8, 0x92, 0x8c, 0xae, 0xb0, 0xd3, 0xcd, 0xef, 0xf1, 0xab, 0xb5, 0x97, 0x89, 0x23, 0x3d, 0x1f, 0x1, 0x5b, 0x45, 0x67, 0x79, 0x2e, 0x30, 0x12, 0xc, 0x56, 0x48, 0x6a, 0x74, 0xde, 0xc0, 0xe2, 0xfc, 0xa6, 0xb8, 0x9a, 0x84, 0x34, 0x2a, 0x8, 0x16, 0x4c, 0x52, 0x70, 0x6e, 0xc4, 0xda, 0xf8, 0xe6, 0xbc, 0xa2, 0x80, 0x9e, 0xc9, 0xd7, 0xf5, 0xeb, 0xb1, 0xaf, 0x8d, 0x93, 0x39, 0x27, 0x5, 0x1b, 0x41, 0x5f, 0x7d, 0x63, 0xbb, 0xa5, 0x87, 0x99, 0xc3, 0xdd, 0xff, 0xe1, 0x4b, 0x55, 0x77, 0x69, 0x33, 0x2d, 0xf, 0x11, 0x46, 0x58, 0x7a, 0x64, 0x3e, 0x20, 0x2, 0x1c, 0xb6, 0xa8, 0x8a, 0x94, 0xce, 0xd0, 0xf2, 0xec, 0x5c, 0x42, 0x60, 0x7e, 0x24, 0x3a, 0x18, 0x6, 0xac, 0xb2, 
0x90, 0x8e, 0xd4, 0xca, 0xe8, 0xf6, 0xa1, 0xbf, 0x9d, 0x83, 0xd9, 0xc7, 0xe5, 0xfb, 0x51, 0x4f, 0x6d, 0x73, 0x29, 0x37, 0x15, 0xb, 0x68, 0x76, 0x54, 0x4a, 0x10, 0xe, 0x2c, 0x32, 0x98, 0x86, 0xa4, 0xba, 0xe0, 0xfe, 0xdc, 0xc2, 0x95, 0x8b, 0xa9, 0xb7, 0xed, 0xf3, 0xd1, 0xcf, 0x65, 0x7b, 0x59, 0x47, 0x1d, 0x3, 0x21, 0x3f, 0x8f, 0x91, 0xb3, 0xad, 0xf7, 0xe9, 0xcb, 0xd5, 0x7f, 0x61, 0x43, 0x5d, 0x7, 0x19, 0x3b, 0x25, 0x72, 0x6c, 0x4e, 0x50, 0xa, 0x14, 0x36, 0x28, 0x82, 0x9c, 0xbe, 0xa0, 0xfa, 0xe4, 0xc6, 0xd8}, + {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5, 0xed, 0xf2, 0xd3, 0xcc, 0x91, 0x8e, 0xaf, 0xb0, 0x15, 0xa, 0x2b, 0x34, 0x69, 0x76, 0x57, 0x48, 0xc7, 0xd8, 0xf9, 0xe6, 0xbb, 0xa4, 0x85, 0x9a, 0x3f, 0x20, 0x1, 0x1e, 0x43, 0x5c, 0x7d, 0x62, 0x2a, 0x35, 0x14, 0xb, 0x56, 0x49, 0x68, 0x77, 0xd2, 0xcd, 0xec, 0xf3, 0xae, 0xb1, 0x90, 0x8f, 0x93, 0x8c, 0xad, 0xb2, 0xef, 0xf0, 0xd1, 0xce, 0x6b, 0x74, 0x55, 0x4a, 0x17, 0x8, 0x29, 0x36, 0x7e, 0x61, 0x40, 0x5f, 0x2, 0x1d, 0x3c, 0x23, 0x86, 0x99, 0xb8, 0xa7, 0xfa, 0xe5, 0xc4, 0xdb, 0x54, 0x4b, 0x6a, 0x75, 0x28, 0x37, 0x16, 0x9, 0xac, 0xb3, 0x92, 0x8d, 0xd0, 0xcf, 0xee, 0xf1, 0xb9, 0xa6, 0x87, 0x98, 0xc5, 0xda, 0xfb, 0xe4, 0x41, 0x5e, 0x7f, 0x60, 0x3d, 0x22, 0x3, 0x1c, 0x3b, 0x24, 0x5, 0x1a, 0x47, 0x58, 0x79, 0x66, 0xc3, 0xdc, 0xfd, 0xe2, 0xbf, 0xa0, 0x81, 0x9e, 0xd6, 0xc9, 0xe8, 0xf7, 0xaa, 0xb5, 0x94, 0x8b, 0x2e, 0x31, 0x10, 0xf, 0x52, 0x4d, 0x6c, 0x73, 0xfc, 0xe3, 0xc2, 0xdd, 0x80, 0x9f, 0xbe, 0xa1, 0x4, 0x1b, 0x3a, 0x25, 0x78, 0x67, 0x46, 0x59, 0x11, 0xe, 0x2f, 0x30, 0x6d, 0x72, 0x53, 0x4c, 0xe9, 0xf6, 0xd7, 0xc8, 0x95, 0x8a, 0xab, 0xb4, 0xa8, 0xb7, 0x96, 0x89, 0xd4, 0xcb, 0xea, 0xf5, 0x50, 0x4f, 0x6e, 0x71, 0x2c, 0x33, 0x12, 0xd, 0x45, 0x5a, 0x7b, 0x64, 0x39, 0x26, 0x7, 0x18, 0xbd, 0xa2, 0x83, 0x9c, 0xc1, 0xde, 0xff, 0xe0, 0x6f, 0x70, 0x51, 0x4e, 0x13, 0xc, 0x2d, 0x32, 0x97, 0x88, 0xa9, 0xb6, 0xeb, 0xf4, 0xd5, 0xca, 0x82, 0x9d, 0xbc, 0xa3, 0xfe, 0xe1, 0xc0, 0xdf, 0x7a, 0x65, 0x44, 0x5b, 0x6, 0x19, 0x38, 0x27}, + {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd, 0x3a, 0x1a, 0x7a, 0x5a, 0xba, 0x9a, 0xfa, 0xda, 0x27, 0x7, 0x67, 0x47, 0xa7, 0x87, 0xe7, 0xc7, 0x74, 0x54, 0x34, 0x14, 0xf4, 0xd4, 0xb4, 0x94, 0x69, 0x49, 0x29, 0x9, 0xe9, 0xc9, 0xa9, 0x89, 0x4e, 0x6e, 0xe, 0x2e, 0xce, 0xee, 0x8e, 0xae, 0x53, 0x73, 0x13, 0x33, 0xd3, 0xf3, 0x93, 0xb3, 0xe8, 0xc8, 0xa8, 0x88, 0x68, 0x48, 0x28, 0x8, 0xf5, 0xd5, 0xb5, 0x95, 0x75, 0x55, 0x35, 0x15, 0xd2, 0xf2, 0x92, 0xb2, 0x52, 0x72, 0x12, 0x32, 0xcf, 0xef, 0x8f, 0xaf, 0x4f, 0x6f, 0xf, 0x2f, 0x9c, 0xbc, 0xdc, 0xfc, 0x1c, 0x3c, 0x5c, 0x7c, 0x81, 0xa1, 0xc1, 0xe1, 0x1, 0x21, 0x41, 0x61, 0xa6, 0x86, 0xe6, 0xc6, 0x26, 0x6, 0x66, 0x46, 0xbb, 0x9b, 0xfb, 0xdb, 0x3b, 0x1b, 0x7b, 0x5b, 0xcd, 0xed, 0x8d, 0xad, 0x4d, 0x6d, 0xd, 0x2d, 0xd0, 0xf0, 0x90, 0xb0, 0x50, 0x70, 0x10, 0x30, 0xf7, 0xd7, 0xb7, 0x97, 0x77, 0x57, 0x37, 0x17, 0xea, 0xca, 0xaa, 0x8a, 0x6a, 0x4a, 0x2a, 0xa, 0xb9, 0x99, 0xf9, 0xd9, 0x39, 0x19, 0x79, 0x59, 0xa4, 0x84, 0xe4, 0xc4, 0x24, 0x4, 0x64, 0x44, 0x83, 0xa3, 0xc3, 0xe3, 0x3, 0x23, 0x43, 0x63, 0x9e, 0xbe, 0xde, 0xfe, 0x1e, 0x3e, 0x5e, 0x7e, 0x25, 0x5, 0x65, 0x45, 0xa5, 0x85, 0xe5, 0xc5, 0x38, 0x18, 0x78, 0x58, 0xb8, 0x98, 0xf8, 0xd8, 0x1f, 0x3f, 0x5f, 0x7f, 0x9f, 0xbf, 0xdf, 0xff, 0x2, 0x22, 0x42, 0x62, 0x82, 0xa2, 0xc2, 0xe2, 0x51, 0x71, 0x11, 0x31, 0xd1, 0xf1, 0x91, 0xb1, 0x4c, 0x6c, 0xc, 0x2c, 0xcc, 0xec, 0x8c, 0xac, 0x6b, 0x4b, 0x2b, 0xb, 0xeb, 0xcb, 0xab, 0x8b, 0x76, 0x56, 0x36, 0x16, 0xf6, 0xd6, 0xb6, 
0x96}, + {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2, 0x2a, 0xb, 0x68, 0x49, 0xae, 0x8f, 0xec, 0xcd, 0x3f, 0x1e, 0x7d, 0x5c, 0xbb, 0x9a, 0xf9, 0xd8, 0x54, 0x75, 0x16, 0x37, 0xd0, 0xf1, 0x92, 0xb3, 0x41, 0x60, 0x3, 0x22, 0xc5, 0xe4, 0x87, 0xa6, 0x7e, 0x5f, 0x3c, 0x1d, 0xfa, 0xdb, 0xb8, 0x99, 0x6b, 0x4a, 0x29, 0x8, 0xef, 0xce, 0xad, 0x8c, 0xa8, 0x89, 0xea, 0xcb, 0x2c, 0xd, 0x6e, 0x4f, 0xbd, 0x9c, 0xff, 0xde, 0x39, 0x18, 0x7b, 0x5a, 0x82, 0xa3, 0xc0, 0xe1, 0x6, 0x27, 0x44, 0x65, 0x97, 0xb6, 0xd5, 0xf4, 0x13, 0x32, 0x51, 0x70, 0xfc, 0xdd, 0xbe, 0x9f, 0x78, 0x59, 0x3a, 0x1b, 0xe9, 0xc8, 0xab, 0x8a, 0x6d, 0x4c, 0x2f, 0xe, 0xd6, 0xf7, 0x94, 0xb5, 0x52, 0x73, 0x10, 0x31, 0xc3, 0xe2, 0x81, 0xa0, 0x47, 0x66, 0x5, 0x24, 0x4d, 0x6c, 0xf, 0x2e, 0xc9, 0xe8, 0x8b, 0xaa, 0x58, 0x79, 0x1a, 0x3b, 0xdc, 0xfd, 0x9e, 0xbf, 0x67, 0x46, 0x25, 0x4, 0xe3, 0xc2, 0xa1, 0x80, 0x72, 0x53, 0x30, 0x11, 0xf6, 0xd7, 0xb4, 0x95, 0x19, 0x38, 0x5b, 0x7a, 0x9d, 0xbc, 0xdf, 0xfe, 0xc, 0x2d, 0x4e, 0x6f, 0x88, 0xa9, 0xca, 0xeb, 0x33, 0x12, 0x71, 0x50, 0xb7, 0x96, 0xf5, 0xd4, 0x26, 0x7, 0x64, 0x45, 0xa2, 0x83, 0xe0, 0xc1, 0xe5, 0xc4, 0xa7, 0x86, 0x61, 0x40, 0x23, 0x2, 0xf0, 0xd1, 0xb2, 0x93, 0x74, 0x55, 0x36, 0x17, 0xcf, 0xee, 0x8d, 0xac, 0x4b, 0x6a, 0x9, 0x28, 0xda, 0xfb, 0x98, 0xb9, 0x5e, 0x7f, 0x1c, 0x3d, 0xb1, 0x90, 0xf3, 0xd2, 0x35, 0x14, 0x77, 0x56, 0xa4, 0x85, 0xe6, 0xc7, 0x20, 0x1, 0x62, 0x43, 0x9b, 0xba, 0xd9, 0xf8, 0x1f, 0x3e, 0x5d, 0x7c, 0x8e, 0xaf, 0xcc, 0xed, 0xa, 0x2b, 0x48, 0x69}, + {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3, 0x1a, 0x38, 0x5e, 0x7c, 0x92, 0xb0, 0xd6, 0xf4, 0x17, 0x35, 0x53, 0x71, 0x9f, 0xbd, 0xdb, 0xf9, 0x34, 0x16, 0x70, 0x52, 0xbc, 0x9e, 0xf8, 0xda, 0x39, 0x1b, 0x7d, 0x5f, 0xb1, 0x93, 0xf5, 0xd7, 0x2e, 0xc, 0x6a, 0x48, 0xa6, 0x84, 0xe2, 0xc0, 0x23, 0x1, 0x67, 0x45, 0xab, 0x89, 0xef, 0xcd, 0x68, 0x4a, 0x2c, 0xe, 0xe0, 0xc2, 0xa4, 0x86, 0x65, 0x47, 0x21, 0x3, 0xed, 0xcf, 0xa9, 0x8b, 0x72, 0x50, 0x36, 0x14, 0xfa, 0xd8, 0xbe, 0x9c, 0x7f, 0x5d, 0x3b, 0x19, 0xf7, 0xd5, 0xb3, 0x91, 0x5c, 0x7e, 0x18, 0x3a, 0xd4, 0xf6, 0x90, 0xb2, 0x51, 0x73, 0x15, 0x37, 0xd9, 0xfb, 0x9d, 0xbf, 0x46, 0x64, 0x2, 0x20, 0xce, 0xec, 0x8a, 0xa8, 0x4b, 0x69, 0xf, 0x2d, 0xc3, 0xe1, 0x87, 0xa5, 0xd0, 0xf2, 0x94, 0xb6, 0x58, 0x7a, 0x1c, 0x3e, 0xdd, 0xff, 0x99, 0xbb, 0x55, 0x77, 0x11, 0x33, 0xca, 0xe8, 0x8e, 0xac, 0x42, 0x60, 0x6, 0x24, 0xc7, 0xe5, 0x83, 0xa1, 0x4f, 0x6d, 0xb, 0x29, 0xe4, 0xc6, 0xa0, 0x82, 0x6c, 0x4e, 0x28, 0xa, 0xe9, 0xcb, 0xad, 0x8f, 0x61, 0x43, 0x25, 0x7, 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, 0xf3, 0xd1, 0xb7, 0x95, 0x7b, 0x59, 0x3f, 0x1d, 0xb8, 0x9a, 0xfc, 0xde, 0x30, 0x12, 0x74, 0x56, 0xb5, 0x97, 0xf1, 0xd3, 0x3d, 0x1f, 0x79, 0x5b, 0xa2, 0x80, 0xe6, 0xc4, 0x2a, 0x8, 0x6e, 0x4c, 0xaf, 0x8d, 0xeb, 0xc9, 0x27, 0x5, 0x63, 0x41, 0x8c, 0xae, 0xc8, 0xea, 0x4, 0x26, 0x40, 0x62, 0x81, 0xa3, 0xc5, 0xe7, 0x9, 0x2b, 0x4d, 0x6f, 0x96, 0xb4, 0xd2, 0xf0, 0x1e, 0x3c, 0x5a, 0x78, 0x9b, 0xb9, 0xdf, 0xfd, 0x13, 0x31, 0x57, 0x75}, + {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec, 0xa, 0x29, 0x4c, 0x6f, 0x86, 0xa5, 0xc0, 0xe3, 0xf, 0x2c, 0x49, 0x6a, 0x83, 0xa0, 0xc5, 0xe6, 0x14, 0x37, 0x52, 0x71, 0x98, 0xbb, 0xde, 0xfd, 0x11, 0x32, 0x57, 0x74, 0x9d, 0xbe, 0xdb, 0xf8, 0x1e, 0x3d, 0x58, 0x7b, 0x92, 0xb1, 0xd4, 0xf7, 0x1b, 0x38, 0x5d, 0x7e, 0x97, 0xb4, 0xd1, 0xf2, 0x28, 0xb, 0x6e, 0x4d, 0xa4, 0x87, 0xe2, 0xc1, 0x2d, 0xe, 0x6b, 0x48, 0xa1, 0x82, 0xe7, 0xc4, 0x22, 0x1, 0x64, 0x47, 
0xae, 0x8d, 0xe8, 0xcb, 0x27, 0x4, 0x61, 0x42, 0xab, 0x88, 0xed, 0xce, 0x3c, 0x1f, 0x7a, 0x59, 0xb0, 0x93, 0xf6, 0xd5, 0x39, 0x1a, 0x7f, 0x5c, 0xb5, 0x96, 0xf3, 0xd0, 0x36, 0x15, 0x70, 0x53, 0xba, 0x99, 0xfc, 0xdf, 0x33, 0x10, 0x75, 0x56, 0xbf, 0x9c, 0xf9, 0xda, 0x50, 0x73, 0x16, 0x35, 0xdc, 0xff, 0x9a, 0xb9, 0x55, 0x76, 0x13, 0x30, 0xd9, 0xfa, 0x9f, 0xbc, 0x5a, 0x79, 0x1c, 0x3f, 0xd6, 0xf5, 0x90, 0xb3, 0x5f, 0x7c, 0x19, 0x3a, 0xd3, 0xf0, 0x95, 0xb6, 0x44, 0x67, 0x2, 0x21, 0xc8, 0xeb, 0x8e, 0xad, 0x41, 0x62, 0x7, 0x24, 0xcd, 0xee, 0x8b, 0xa8, 0x4e, 0x6d, 0x8, 0x2b, 0xc2, 0xe1, 0x84, 0xa7, 0x4b, 0x68, 0xd, 0x2e, 0xc7, 0xe4, 0x81, 0xa2, 0x78, 0x5b, 0x3e, 0x1d, 0xf4, 0xd7, 0xb2, 0x91, 0x7d, 0x5e, 0x3b, 0x18, 0xf1, 0xd2, 0xb7, 0x94, 0x72, 0x51, 0x34, 0x17, 0xfe, 0xdd, 0xb8, 0x9b, 0x77, 0x54, 0x31, 0x12, 0xfb, 0xd8, 0xbd, 0x9e, 0x6c, 0x4f, 0x2a, 0x9, 0xe0, 0xc3, 0xa6, 0x85, 0x69, 0x4a, 0x2f, 0xc, 0xe5, 0xc6, 0xa3, 0x80, 0x66, 0x45, 0x20, 0x3, 0xea, 0xc9, 0xac, 0x8f, 0x63, 0x40, 0x25, 0x6, 0xef, 0xcc, 0xa9, 0x8a}, + {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1, 0x7a, 0x5e, 0x32, 0x16, 0xea, 0xce, 0xa2, 0x86, 0x47, 0x63, 0xf, 0x2b, 0xd7, 0xf3, 0x9f, 0xbb, 0xf4, 0xd0, 0xbc, 0x98, 0x64, 0x40, 0x2c, 0x8, 0xc9, 0xed, 0x81, 0xa5, 0x59, 0x7d, 0x11, 0x35, 0x8e, 0xaa, 0xc6, 0xe2, 0x1e, 0x3a, 0x56, 0x72, 0xb3, 0x97, 0xfb, 0xdf, 0x23, 0x7, 0x6b, 0x4f, 0xf5, 0xd1, 0xbd, 0x99, 0x65, 0x41, 0x2d, 0x9, 0xc8, 0xec, 0x80, 0xa4, 0x58, 0x7c, 0x10, 0x34, 0x8f, 0xab, 0xc7, 0xe3, 0x1f, 0x3b, 0x57, 0x73, 0xb2, 0x96, 0xfa, 0xde, 0x22, 0x6, 0x6a, 0x4e, 0x1, 0x25, 0x49, 0x6d, 0x91, 0xb5, 0xd9, 0xfd, 0x3c, 0x18, 0x74, 0x50, 0xac, 0x88, 0xe4, 0xc0, 0x7b, 0x5f, 0x33, 0x17, 0xeb, 0xcf, 0xa3, 0x87, 0x46, 0x62, 0xe, 0x2a, 0xd6, 0xf2, 0x9e, 0xba, 0xf7, 0xd3, 0xbf, 0x9b, 0x67, 0x43, 0x2f, 0xb, 0xca, 0xee, 0x82, 0xa6, 0x5a, 0x7e, 0x12, 0x36, 0x8d, 0xa9, 0xc5, 0xe1, 0x1d, 0x39, 0x55, 0x71, 0xb0, 0x94, 0xf8, 0xdc, 0x20, 0x4, 0x68, 0x4c, 0x3, 0x27, 0x4b, 0x6f, 0x93, 0xb7, 0xdb, 0xff, 0x3e, 0x1a, 0x76, 0x52, 0xae, 0x8a, 0xe6, 0xc2, 0x79, 0x5d, 0x31, 0x15, 0xe9, 0xcd, 0xa1, 0x85, 0x44, 0x60, 0xc, 0x28, 0xd4, 0xf0, 0x9c, 0xb8, 0x2, 0x26, 0x4a, 0x6e, 0x92, 0xb6, 0xda, 0xfe, 0x3f, 0x1b, 0x77, 0x53, 0xaf, 0x8b, 0xe7, 0xc3, 0x78, 0x5c, 0x30, 0x14, 0xe8, 0xcc, 0xa0, 0x84, 0x45, 0x61, 0xd, 0x29, 0xd5, 0xf1, 0x9d, 0xb9, 0xf6, 0xd2, 0xbe, 0x9a, 0x66, 0x42, 0x2e, 0xa, 0xcb, 0xef, 0x83, 0xa7, 0x5b, 0x7f, 0x13, 0x37, 0x8c, 0xa8, 0xc4, 0xe0, 0x1c, 0x38, 0x54, 0x70, 0xb1, 0x95, 0xf9, 0xdd, 0x21, 0x5, 0x69, 0x4d}, + {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce, 0x6a, 0x4f, 0x20, 0x5, 0xfe, 0xdb, 0xb4, 0x91, 0x5f, 0x7a, 0x15, 0x30, 0xcb, 0xee, 0x81, 0xa4, 0xd4, 0xf1, 0x9e, 0xbb, 0x40, 0x65, 0xa, 0x2f, 0xe1, 0xc4, 0xab, 0x8e, 0x75, 0x50, 0x3f, 0x1a, 0xbe, 0x9b, 0xf4, 0xd1, 0x2a, 0xf, 0x60, 0x45, 0x8b, 0xae, 0xc1, 0xe4, 0x1f, 0x3a, 0x55, 0x70, 0xb5, 0x90, 0xff, 0xda, 0x21, 0x4, 0x6b, 0x4e, 0x80, 0xa5, 0xca, 0xef, 0x14, 0x31, 0x5e, 0x7b, 0xdf, 0xfa, 0x95, 0xb0, 0x4b, 0x6e, 0x1, 0x24, 0xea, 0xcf, 0xa0, 0x85, 0x7e, 0x5b, 0x34, 0x11, 0x61, 0x44, 0x2b, 0xe, 0xf5, 0xd0, 0xbf, 0x9a, 0x54, 0x71, 0x1e, 0x3b, 0xc0, 0xe5, 0x8a, 0xaf, 0xb, 0x2e, 0x41, 0x64, 0x9f, 0xba, 0xd5, 0xf0, 0x3e, 0x1b, 0x74, 0x51, 0xaa, 0x8f, 0xe0, 0xc5, 0x77, 0x52, 0x3d, 0x18, 0xe3, 0xc6, 0xa9, 0x8c, 0x42, 0x67, 0x8, 0x2d, 0xd6, 0xf3, 0x9c, 0xb9, 0x1d, 0x38, 0x57, 0x72, 0x89, 0xac, 0xc3, 0xe6, 0x28, 0xd, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3, 0xa3, 0x86, 0xe9, 0xcc, 0x37, 0x12, 0x7d, 0x58, 0x96, 
0xb3, 0xdc, 0xf9, 0x2, 0x27, 0x48, 0x6d, 0xc9, 0xec, 0x83, 0xa6, 0x5d, 0x78, 0x17, 0x32, 0xfc, 0xd9, 0xb6, 0x93, 0x68, 0x4d, 0x22, 0x7, 0xc2, 0xe7, 0x88, 0xad, 0x56, 0x73, 0x1c, 0x39, 0xf7, 0xd2, 0xbd, 0x98, 0x63, 0x46, 0x29, 0xc, 0xa8, 0x8d, 0xe2, 0xc7, 0x3c, 0x19, 0x76, 0x53, 0x9d, 0xb8, 0xd7, 0xf2, 0x9, 0x2c, 0x43, 0x66, 0x16, 0x33, 0x5c, 0x79, 0x82, 0xa7, 0xc8, 0xed, 0x23, 0x6, 0x69, 0x4c, 0xb7, 0x92, 0xfd, 0xd8, 0x7c, 0x59, 0x36, 0x13, 0xe8, 0xcd, 0xa2, 0x87, 0x49, 0x6c, 0x3, 0x26, 0xdd, 0xf8, 0x97, 0xb2}, + {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf, 0x5a, 0x7c, 0x16, 0x30, 0xc2, 0xe4, 0x8e, 0xa8, 0x77, 0x51, 0x3b, 0x1d, 0xef, 0xc9, 0xa3, 0x85, 0xb4, 0x92, 0xf8, 0xde, 0x2c, 0xa, 0x60, 0x46, 0x99, 0xbf, 0xd5, 0xf3, 0x1, 0x27, 0x4d, 0x6b, 0xee, 0xc8, 0xa2, 0x84, 0x76, 0x50, 0x3a, 0x1c, 0xc3, 0xe5, 0x8f, 0xa9, 0x5b, 0x7d, 0x17, 0x31, 0x75, 0x53, 0x39, 0x1f, 0xed, 0xcb, 0xa1, 0x87, 0x58, 0x7e, 0x14, 0x32, 0xc0, 0xe6, 0x8c, 0xaa, 0x2f, 0x9, 0x63, 0x45, 0xb7, 0x91, 0xfb, 0xdd, 0x2, 0x24, 0x4e, 0x68, 0x9a, 0xbc, 0xd6, 0xf0, 0xc1, 0xe7, 0x8d, 0xab, 0x59, 0x7f, 0x15, 0x33, 0xec, 0xca, 0xa0, 0x86, 0x74, 0x52, 0x38, 0x1e, 0x9b, 0xbd, 0xd7, 0xf1, 0x3, 0x25, 0x4f, 0x69, 0xb6, 0x90, 0xfa, 0xdc, 0x2e, 0x8, 0x62, 0x44, 0xea, 0xcc, 0xa6, 0x80, 0x72, 0x54, 0x3e, 0x18, 0xc7, 0xe1, 0x8b, 0xad, 0x5f, 0x79, 0x13, 0x35, 0xb0, 0x96, 0xfc, 0xda, 0x28, 0xe, 0x64, 0x42, 0x9d, 0xbb, 0xd1, 0xf7, 0x5, 0x23, 0x49, 0x6f, 0x5e, 0x78, 0x12, 0x34, 0xc6, 0xe0, 0x8a, 0xac, 0x73, 0x55, 0x3f, 0x19, 0xeb, 0xcd, 0xa7, 0x81, 0x4, 0x22, 0x48, 0x6e, 0x9c, 0xba, 0xd0, 0xf6, 0x29, 0xf, 0x65, 0x43, 0xb1, 0x97, 0xfd, 0xdb, 0x9f, 0xb9, 0xd3, 0xf5, 0x7, 0x21, 0x4b, 0x6d, 0xb2, 0x94, 0xfe, 0xd8, 0x2a, 0xc, 0x66, 0x40, 0xc5, 0xe3, 0x89, 0xaf, 0x5d, 0x7b, 0x11, 0x37, 0xe8, 0xce, 0xa4, 0x82, 0x70, 0x56, 0x3c, 0x1a, 0x2b, 0xd, 0x67, 0x41, 0xb3, 0x95, 0xff, 0xd9, 0x6, 0x20, 0x4a, 0x6c, 0x9e, 0xb8, 0xd2, 0xf4, 0x71, 0x57, 0x3d, 0x1b, 0xe9, 0xcf, 0xa5, 0x83, 0x5c, 0x7a, 0x10, 0x36, 0xc4, 0xe2, 0x88, 0xae}, + {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0, 0x4a, 0x6d, 0x4, 0x23, 0xd6, 0xf1, 0x98, 0xbf, 0x6f, 0x48, 0x21, 0x6, 0xf3, 0xd4, 0xbd, 0x9a, 0x94, 0xb3, 0xda, 0xfd, 0x8, 0x2f, 0x46, 0x61, 0xb1, 0x96, 0xff, 0xd8, 0x2d, 0xa, 0x63, 0x44, 0xde, 0xf9, 0x90, 0xb7, 0x42, 0x65, 0xc, 0x2b, 0xfb, 0xdc, 0xb5, 0x92, 0x67, 0x40, 0x29, 0xe, 0x35, 0x12, 0x7b, 0x5c, 0xa9, 0x8e, 0xe7, 0xc0, 0x10, 0x37, 0x5e, 0x79, 0x8c, 0xab, 0xc2, 0xe5, 0x7f, 0x58, 0x31, 0x16, 0xe3, 0xc4, 0xad, 0x8a, 0x5a, 0x7d, 0x14, 0x33, 0xc6, 0xe1, 0x88, 0xaf, 0xa1, 0x86, 0xef, 0xc8, 0x3d, 0x1a, 0x73, 0x54, 0x84, 0xa3, 0xca, 0xed, 0x18, 0x3f, 0x56, 0x71, 0xeb, 0xcc, 0xa5, 0x82, 0x77, 0x50, 0x39, 0x1e, 0xce, 0xe9, 0x80, 0xa7, 0x52, 0x75, 0x1c, 0x3b, 0x6a, 0x4d, 0x24, 0x3, 0xf6, 0xd1, 0xb8, 0x9f, 0x4f, 0x68, 0x1, 0x26, 0xd3, 0xf4, 0x9d, 0xba, 0x20, 0x7, 0x6e, 0x49, 0xbc, 0x9b, 0xf2, 0xd5, 0x5, 0x22, 0x4b, 0x6c, 0x99, 0xbe, 0xd7, 0xf0, 0xfe, 0xd9, 0xb0, 0x97, 0x62, 0x45, 0x2c, 0xb, 0xdb, 0xfc, 0x95, 0xb2, 0x47, 0x60, 0x9, 0x2e, 0xb4, 0x93, 0xfa, 0xdd, 0x28, 0xf, 0x66, 0x41, 0x91, 0xb6, 0xdf, 0xf8, 0xd, 0x2a, 0x43, 0x64, 0x5f, 0x78, 0x11, 0x36, 0xc3, 0xe4, 0x8d, 0xaa, 0x7a, 0x5d, 0x34, 0x13, 0xe6, 0xc1, 0xa8, 0x8f, 0x15, 0x32, 0x5b, 0x7c, 0x89, 0xae, 0xc7, 0xe0, 0x30, 0x17, 0x7e, 0x59, 0xac, 0x8b, 0xe2, 0xc5, 0xcb, 0xec, 0x85, 0xa2, 0x57, 0x70, 0x19, 0x3e, 0xee, 0xc9, 0xa0, 0x87, 0x72, 0x55, 0x3c, 0x1b, 0x81, 0xa6, 0xcf, 0xe8, 0x1d, 0x3a, 0x53, 0x74, 0xa4, 0x83, 0xea, 0xcd, 0x38, 0x1f, 
0x76, 0x51}, + {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85, 0xba, 0x92, 0xea, 0xc2, 0x1a, 0x32, 0x4a, 0x62, 0xe7, 0xcf, 0xb7, 0x9f, 0x47, 0x6f, 0x17, 0x3f, 0x69, 0x41, 0x39, 0x11, 0xc9, 0xe1, 0x99, 0xb1, 0x34, 0x1c, 0x64, 0x4c, 0x94, 0xbc, 0xc4, 0xec, 0xd3, 0xfb, 0x83, 0xab, 0x73, 0x5b, 0x23, 0xb, 0x8e, 0xa6, 0xde, 0xf6, 0x2e, 0x6, 0x7e, 0x56, 0xd2, 0xfa, 0x82, 0xaa, 0x72, 0x5a, 0x22, 0xa, 0x8f, 0xa7, 0xdf, 0xf7, 0x2f, 0x7, 0x7f, 0x57, 0x68, 0x40, 0x38, 0x10, 0xc8, 0xe0, 0x98, 0xb0, 0x35, 0x1d, 0x65, 0x4d, 0x95, 0xbd, 0xc5, 0xed, 0xbb, 0x93, 0xeb, 0xc3, 0x1b, 0x33, 0x4b, 0x63, 0xe6, 0xce, 0xb6, 0x9e, 0x46, 0x6e, 0x16, 0x3e, 0x1, 0x29, 0x51, 0x79, 0xa1, 0x89, 0xf1, 0xd9, 0x5c, 0x74, 0xc, 0x24, 0xfc, 0xd4, 0xac, 0x84, 0xb9, 0x91, 0xe9, 0xc1, 0x19, 0x31, 0x49, 0x61, 0xe4, 0xcc, 0xb4, 0x9c, 0x44, 0x6c, 0x14, 0x3c, 0x3, 0x2b, 0x53, 0x7b, 0xa3, 0x8b, 0xf3, 0xdb, 0x5e, 0x76, 0xe, 0x26, 0xfe, 0xd6, 0xae, 0x86, 0xd0, 0xf8, 0x80, 0xa8, 0x70, 0x58, 0x20, 0x8, 0x8d, 0xa5, 0xdd, 0xf5, 0x2d, 0x5, 0x7d, 0x55, 0x6a, 0x42, 0x3a, 0x12, 0xca, 0xe2, 0x9a, 0xb2, 0x37, 0x1f, 0x67, 0x4f, 0x97, 0xbf, 0xc7, 0xef, 0x6b, 0x43, 0x3b, 0x13, 0xcb, 0xe3, 0x9b, 0xb3, 0x36, 0x1e, 0x66, 0x4e, 0x96, 0xbe, 0xc6, 0xee, 0xd1, 0xf9, 0x81, 0xa9, 0x71, 0x59, 0x21, 0x9, 0x8c, 0xa4, 0xdc, 0xf4, 0x2c, 0x4, 0x7c, 0x54, 0x2, 0x2a, 0x52, 0x7a, 0xa2, 0x8a, 0xf2, 0xda, 0x5f, 0x77, 0xf, 0x27, 0xff, 0xd7, 0xaf, 0x87, 0xb8, 0x90, 0xe8, 0xc0, 0x18, 0x30, 0x48, 0x60, 0xe5, 0xcd, 0xb5, 0x9d, 0x45, 0x6d, 0x15, 0x3d}, + {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a, 0xaa, 0x83, 0xf8, 0xd1, 0xe, 0x27, 0x5c, 0x75, 0xff, 0xd6, 0xad, 0x84, 0x5b, 0x72, 0x9, 0x20, 0x49, 0x60, 0x1b, 0x32, 0xed, 0xc4, 0xbf, 0x96, 0x1c, 0x35, 0x4e, 0x67, 0xb8, 0x91, 0xea, 0xc3, 0xe3, 0xca, 0xb1, 0x98, 0x47, 0x6e, 0x15, 0x3c, 0xb6, 0x9f, 0xe4, 0xcd, 0x12, 0x3b, 0x40, 0x69, 0x92, 0xbb, 0xc0, 0xe9, 0x36, 0x1f, 0x64, 0x4d, 0xc7, 0xee, 0x95, 0xbc, 0x63, 0x4a, 0x31, 0x18, 0x38, 0x11, 0x6a, 0x43, 0x9c, 0xb5, 0xce, 0xe7, 0x6d, 0x44, 0x3f, 0x16, 0xc9, 0xe0, 0x9b, 0xb2, 0xdb, 0xf2, 0x89, 0xa0, 0x7f, 0x56, 0x2d, 0x4, 0x8e, 0xa7, 0xdc, 0xf5, 0x2a, 0x3, 0x78, 0x51, 0x71, 0x58, 0x23, 0xa, 0xd5, 0xfc, 0x87, 0xae, 0x24, 0xd, 0x76, 0x5f, 0x80, 0xa9, 0xd2, 0xfb, 0x39, 0x10, 0x6b, 0x42, 0x9d, 0xb4, 0xcf, 0xe6, 0x6c, 0x45, 0x3e, 0x17, 0xc8, 0xe1, 0x9a, 0xb3, 0x93, 0xba, 0xc1, 0xe8, 0x37, 0x1e, 0x65, 0x4c, 0xc6, 0xef, 0x94, 0xbd, 0x62, 0x4b, 0x30, 0x19, 0x70, 0x59, 0x22, 0xb, 0xd4, 0xfd, 0x86, 0xaf, 0x25, 0xc, 0x77, 0x5e, 0x81, 0xa8, 0xd3, 0xfa, 0xda, 0xf3, 0x88, 0xa1, 0x7e, 0x57, 0x2c, 0x5, 0x8f, 0xa6, 0xdd, 0xf4, 0x2b, 0x2, 0x79, 0x50, 0xab, 0x82, 0xf9, 0xd0, 0xf, 0x26, 0x5d, 0x74, 0xfe, 0xd7, 0xac, 0x85, 0x5a, 0x73, 0x8, 0x21, 0x1, 0x28, 0x53, 0x7a, 0xa5, 0x8c, 0xf7, 0xde, 0x54, 0x7d, 0x6, 0x2f, 0xf0, 0xd9, 0xa2, 0x8b, 0xe2, 0xcb, 0xb0, 0x99, 0x46, 0x6f, 0x14, 0x3d, 0xb7, 0x9e, 0xe5, 0xcc, 0x13, 0x3a, 0x41, 0x68, 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97, 0x1d, 0x34, 0x4f, 0x66, 0xb9, 0x90, 0xeb, 0xc2}, + {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b, 0x9a, 0xb0, 0xce, 0xe4, 0x32, 0x18, 0x66, 0x4c, 0xd7, 0xfd, 0x83, 0xa9, 0x7f, 0x55, 0x2b, 0x1, 0x29, 0x3, 0x7d, 0x57, 0x81, 0xab, 0xd5, 0xff, 0x64, 0x4e, 0x30, 0x1a, 0xcc, 0xe6, 0x98, 0xb2, 0xb3, 0x99, 0xe7, 0xcd, 0x1b, 0x31, 0x4f, 0x65, 0xfe, 0xd4, 0xaa, 0x80, 0x56, 0x7c, 0x2, 0x28, 0x52, 0x78, 0x6, 0x2c, 0xfa, 0xd0, 0xae, 0x84, 0x1f, 0x35, 0x4b, 0x61, 0xb7, 0x9d, 0xe3, 0xc9, 0xc8, 0xe2, 
0x9c, 0xb6, 0x60, 0x4a, 0x34, 0x1e, 0x85, 0xaf, 0xd1, 0xfb, 0x2d, 0x7, 0x79, 0x53, 0x7b, 0x51, 0x2f, 0x5, 0xd3, 0xf9, 0x87, 0xad, 0x36, 0x1c, 0x62, 0x48, 0x9e, 0xb4, 0xca, 0xe0, 0xe1, 0xcb, 0xb5, 0x9f, 0x49, 0x63, 0x1d, 0x37, 0xac, 0x86, 0xf8, 0xd2, 0x4, 0x2e, 0x50, 0x7a, 0xa4, 0x8e, 0xf0, 0xda, 0xc, 0x26, 0x58, 0x72, 0xe9, 0xc3, 0xbd, 0x97, 0x41, 0x6b, 0x15, 0x3f, 0x3e, 0x14, 0x6a, 0x40, 0x96, 0xbc, 0xc2, 0xe8, 0x73, 0x59, 0x27, 0xd, 0xdb, 0xf1, 0x8f, 0xa5, 0x8d, 0xa7, 0xd9, 0xf3, 0x25, 0xf, 0x71, 0x5b, 0xc0, 0xea, 0x94, 0xbe, 0x68, 0x42, 0x3c, 0x16, 0x17, 0x3d, 0x43, 0x69, 0xbf, 0x95, 0xeb, 0xc1, 0x5a, 0x70, 0xe, 0x24, 0xf2, 0xd8, 0xa6, 0x8c, 0xf6, 0xdc, 0xa2, 0x88, 0x5e, 0x74, 0xa, 0x20, 0xbb, 0x91, 0xef, 0xc5, 0x13, 0x39, 0x47, 0x6d, 0x6c, 0x46, 0x38, 0x12, 0xc4, 0xee, 0x90, 0xba, 0x21, 0xb, 0x75, 0x5f, 0x89, 0xa3, 0xdd, 0xf7, 0xdf, 0xf5, 0x8b, 0xa1, 0x77, 0x5d, 0x23, 0x9, 0x92, 0xb8, 0xc6, 0xec, 0x3a, 0x10, 0x6e, 0x44, 0x45, 0x6f, 0x11, 0x3b, 0xed, 0xc7, 0xb9, 0x93, 0x8, 0x22, 0x5c, 0x76, 0xa0, 0x8a, 0xf4, 0xde}, + {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94, 0x8a, 0xa1, 0xdc, 0xf7, 0x26, 0xd, 0x70, 0x5b, 0xcf, 0xe4, 0x99, 0xb2, 0x63, 0x48, 0x35, 0x1e, 0x9, 0x22, 0x5f, 0x74, 0xa5, 0x8e, 0xf3, 0xd8, 0x4c, 0x67, 0x1a, 0x31, 0xe0, 0xcb, 0xb6, 0x9d, 0x83, 0xa8, 0xd5, 0xfe, 0x2f, 0x4, 0x79, 0x52, 0xc6, 0xed, 0x90, 0xbb, 0x6a, 0x41, 0x3c, 0x17, 0x12, 0x39, 0x44, 0x6f, 0xbe, 0x95, 0xe8, 0xc3, 0x57, 0x7c, 0x1, 0x2a, 0xfb, 0xd0, 0xad, 0x86, 0x98, 0xb3, 0xce, 0xe5, 0x34, 0x1f, 0x62, 0x49, 0xdd, 0xf6, 0x8b, 0xa0, 0x71, 0x5a, 0x27, 0xc, 0x1b, 0x30, 0x4d, 0x66, 0xb7, 0x9c, 0xe1, 0xca, 0x5e, 0x75, 0x8, 0x23, 0xf2, 0xd9, 0xa4, 0x8f, 0x91, 0xba, 0xc7, 0xec, 0x3d, 0x16, 0x6b, 0x40, 0xd4, 0xff, 0x82, 0xa9, 0x78, 0x53, 0x2e, 0x5, 0x24, 0xf, 0x72, 0x59, 0x88, 0xa3, 0xde, 0xf5, 0x61, 0x4a, 0x37, 0x1c, 0xcd, 0xe6, 0x9b, 0xb0, 0xae, 0x85, 0xf8, 0xd3, 0x2, 0x29, 0x54, 0x7f, 0xeb, 0xc0, 0xbd, 0x96, 0x47, 0x6c, 0x11, 0x3a, 0x2d, 0x6, 0x7b, 0x50, 0x81, 0xaa, 0xd7, 0xfc, 0x68, 0x43, 0x3e, 0x15, 0xc4, 0xef, 0x92, 0xb9, 0xa7, 0x8c, 0xf1, 0xda, 0xb, 0x20, 0x5d, 0x76, 0xe2, 0xc9, 0xb4, 0x9f, 0x4e, 0x65, 0x18, 0x33, 0x36, 0x1d, 0x60, 0x4b, 0x9a, 0xb1, 0xcc, 0xe7, 0x73, 0x58, 0x25, 0xe, 0xdf, 0xf4, 0x89, 0xa2, 0xbc, 0x97, 0xea, 0xc1, 0x10, 0x3b, 0x46, 0x6d, 0xf9, 0xd2, 0xaf, 0x84, 0x55, 0x7e, 0x3, 0x28, 0x3f, 0x14, 0x69, 0x42, 0x93, 0xb8, 0xc5, 0xee, 0x7a, 0x51, 0x2c, 0x7, 0xd6, 0xfd, 0x80, 0xab, 0xb5, 0x9e, 0xe3, 0xc8, 0x19, 0x32, 0x4f, 0x64, 0xf0, 0xdb, 0xa6, 0x8d, 0x5c, 0x77, 0xa, 0x21}, + {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9, 0xfa, 0xd6, 0xa2, 0x8e, 0x4a, 0x66, 0x12, 0x3e, 0x87, 0xab, 0xdf, 0xf3, 0x37, 0x1b, 0x6f, 0x43, 0xe9, 0xc5, 0xb1, 0x9d, 0x59, 0x75, 0x1, 0x2d, 0x94, 0xb8, 0xcc, 0xe0, 0x24, 0x8, 0x7c, 0x50, 0x13, 0x3f, 0x4b, 0x67, 0xa3, 0x8f, 0xfb, 0xd7, 0x6e, 0x42, 0x36, 0x1a, 0xde, 0xf2, 0x86, 0xaa, 0xcf, 0xe3, 0x97, 0xbb, 0x7f, 0x53, 0x27, 0xb, 0xb2, 0x9e, 0xea, 0xc6, 0x2, 0x2e, 0x5a, 0x76, 0x35, 0x19, 0x6d, 0x41, 0x85, 0xa9, 0xdd, 0xf1, 0x48, 0x64, 0x10, 0x3c, 0xf8, 0xd4, 0xa0, 0x8c, 0x26, 0xa, 0x7e, 0x52, 0x96, 0xba, 0xce, 0xe2, 0x5b, 0x77, 0x3, 0x2f, 0xeb, 0xc7, 0xb3, 0x9f, 0xdc, 0xf0, 0x84, 0xa8, 0x6c, 0x40, 0x34, 0x18, 0xa1, 0x8d, 0xf9, 0xd5, 0x11, 0x3d, 0x49, 0x65, 0x83, 0xaf, 0xdb, 0xf7, 0x33, 0x1f, 0x6b, 0x47, 0xfe, 0xd2, 0xa6, 0x8a, 0x4e, 0x62, 0x16, 0x3a, 0x79, 0x55, 0x21, 0xd, 0xc9, 0xe5, 0x91, 0xbd, 0x4, 0x28, 0x5c, 0x70, 0xb4, 0x98, 0xec, 0xc0, 0x6a, 0x46, 0x32, 0x1e, 0xda, 0xf6, 0x82, 
0xae, 0x17, 0x3b, 0x4f, 0x63, 0xa7, 0x8b, 0xff, 0xd3, 0x90, 0xbc, 0xc8, 0xe4, 0x20, 0xc, 0x78, 0x54, 0xed, 0xc1, 0xb5, 0x99, 0x5d, 0x71, 0x5, 0x29, 0x4c, 0x60, 0x14, 0x38, 0xfc, 0xd0, 0xa4, 0x88, 0x31, 0x1d, 0x69, 0x45, 0x81, 0xad, 0xd9, 0xf5, 0xb6, 0x9a, 0xee, 0xc2, 0x6, 0x2a, 0x5e, 0x72, 0xcb, 0xe7, 0x93, 0xbf, 0x7b, 0x57, 0x23, 0xf, 0xa5, 0x89, 0xfd, 0xd1, 0x15, 0x39, 0x4d, 0x61, 0xd8, 0xf4, 0x80, 0xac, 0x68, 0x44, 0x30, 0x1c, 0x5f, 0x73, 0x7, 0x2b, 0xef, 0xc3, 0xb7, 0x9b, 0x22, 0xe, 0x7a, 0x56, 0x92, 0xbe, 0xca, 0xe6}, + {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6, 0xea, 0xc7, 0xb0, 0x9d, 0x5e, 0x73, 0x4, 0x29, 0x9f, 0xb2, 0xc5, 0xe8, 0x2b, 0x6, 0x71, 0x5c, 0xc9, 0xe4, 0x93, 0xbe, 0x7d, 0x50, 0x27, 0xa, 0xbc, 0x91, 0xe6, 0xcb, 0x8, 0x25, 0x52, 0x7f, 0x23, 0xe, 0x79, 0x54, 0x97, 0xba, 0xcd, 0xe0, 0x56, 0x7b, 0xc, 0x21, 0xe2, 0xcf, 0xb8, 0x95, 0x8f, 0xa2, 0xd5, 0xf8, 0x3b, 0x16, 0x61, 0x4c, 0xfa, 0xd7, 0xa0, 0x8d, 0x4e, 0x63, 0x14, 0x39, 0x65, 0x48, 0x3f, 0x12, 0xd1, 0xfc, 0x8b, 0xa6, 0x10, 0x3d, 0x4a, 0x67, 0xa4, 0x89, 0xfe, 0xd3, 0x46, 0x6b, 0x1c, 0x31, 0xf2, 0xdf, 0xa8, 0x85, 0x33, 0x1e, 0x69, 0x44, 0x87, 0xaa, 0xdd, 0xf0, 0xac, 0x81, 0xf6, 0xdb, 0x18, 0x35, 0x42, 0x6f, 0xd9, 0xf4, 0x83, 0xae, 0x6d, 0x40, 0x37, 0x1a, 0x3, 0x2e, 0x59, 0x74, 0xb7, 0x9a, 0xed, 0xc0, 0x76, 0x5b, 0x2c, 0x1, 0xc2, 0xef, 0x98, 0xb5, 0xe9, 0xc4, 0xb3, 0x9e, 0x5d, 0x70, 0x7, 0x2a, 0x9c, 0xb1, 0xc6, 0xeb, 0x28, 0x5, 0x72, 0x5f, 0xca, 0xe7, 0x90, 0xbd, 0x7e, 0x53, 0x24, 0x9, 0xbf, 0x92, 0xe5, 0xc8, 0xb, 0x26, 0x51, 0x7c, 0x20, 0xd, 0x7a, 0x57, 0x94, 0xb9, 0xce, 0xe3, 0x55, 0x78, 0xf, 0x22, 0xe1, 0xcc, 0xbb, 0x96, 0x8c, 0xa1, 0xd6, 0xfb, 0x38, 0x15, 0x62, 0x4f, 0xf9, 0xd4, 0xa3, 0x8e, 0x4d, 0x60, 0x17, 0x3a, 0x66, 0x4b, 0x3c, 0x11, 0xd2, 0xff, 0x88, 0xa5, 0x13, 0x3e, 0x49, 0x64, 0xa7, 0x8a, 0xfd, 0xd0, 0x45, 0x68, 0x1f, 0x32, 0xf1, 0xdc, 0xab, 0x86, 0x30, 0x1d, 0x6a, 0x47, 0x84, 0xa9, 0xde, 0xf3, 0xaf, 0x82, 0xf5, 0xd8, 0x1b, 0x36, 0x41, 0x6c, 0xda, 0xf7, 0x80, 0xad, 0x6e, 0x43, 0x34, 0x19}, + {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7, 0xda, 0xf4, 0x86, 0xa8, 0x62, 0x4c, 0x3e, 0x10, 0xb7, 0x99, 0xeb, 0xc5, 0xf, 0x21, 0x53, 0x7d, 0xa9, 0x87, 0xf5, 0xdb, 0x11, 0x3f, 0x4d, 0x63, 0xc4, 0xea, 0x98, 0xb6, 0x7c, 0x52, 0x20, 0xe, 0x73, 0x5d, 0x2f, 0x1, 0xcb, 0xe5, 0x97, 0xb9, 0x1e, 0x30, 0x42, 0x6c, 0xa6, 0x88, 0xfa, 0xd4, 0x4f, 0x61, 0x13, 0x3d, 0xf7, 0xd9, 0xab, 0x85, 0x22, 0xc, 0x7e, 0x50, 0x9a, 0xb4, 0xc6, 0xe8, 0x95, 0xbb, 0xc9, 0xe7, 0x2d, 0x3, 0x71, 0x5f, 0xf8, 0xd6, 0xa4, 0x8a, 0x40, 0x6e, 0x1c, 0x32, 0xe6, 0xc8, 0xba, 0x94, 0x5e, 0x70, 0x2, 0x2c, 0x8b, 0xa5, 0xd7, 0xf9, 0x33, 0x1d, 0x6f, 0x41, 0x3c, 0x12, 0x60, 0x4e, 0x84, 0xaa, 0xd8, 0xf6, 0x51, 0x7f, 0xd, 0x23, 0xe9, 0xc7, 0xb5, 0x9b, 0x9e, 0xb0, 0xc2, 0xec, 0x26, 0x8, 0x7a, 0x54, 0xf3, 0xdd, 0xaf, 0x81, 0x4b, 0x65, 0x17, 0x39, 0x44, 0x6a, 0x18, 0x36, 0xfc, 0xd2, 0xa0, 0x8e, 0x29, 0x7, 0x75, 0x5b, 0x91, 0xbf, 0xcd, 0xe3, 0x37, 0x19, 0x6b, 0x45, 0x8f, 0xa1, 0xd3, 0xfd, 0x5a, 0x74, 0x6, 0x28, 0xe2, 0xcc, 0xbe, 0x90, 0xed, 0xc3, 0xb1, 0x9f, 0x55, 0x7b, 0x9, 0x27, 0x80, 0xae, 0xdc, 0xf2, 0x38, 0x16, 0x64, 0x4a, 0xd1, 0xff, 0x8d, 0xa3, 0x69, 0x47, 0x35, 0x1b, 0xbc, 0x92, 0xe0, 0xce, 0x4, 0x2a, 0x58, 0x76, 0xb, 0x25, 0x57, 0x79, 0xb3, 0x9d, 0xef, 0xc1, 0x66, 0x48, 0x3a, 0x14, 0xde, 0xf0, 0x82, 0xac, 0x78, 0x56, 0x24, 0xa, 0xc0, 0xee, 0x9c, 0xb2, 0x15, 0x3b, 0x49, 0x67, 0xad, 0x83, 0xf1, 0xdf, 0xa2, 0x8c, 0xfe, 0xd0, 0x1a, 0x34, 0x46, 0x68, 0xcf, 0xe1, 0x93, 0xbd, 
0x77, 0x59, 0x2b, 0x5}, + {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8, 0xca, 0xe5, 0x94, 0xbb, 0x76, 0x59, 0x28, 0x7, 0xaf, 0x80, 0xf1, 0xde, 0x13, 0x3c, 0x4d, 0x62, 0x89, 0xa6, 0xd7, 0xf8, 0x35, 0x1a, 0x6b, 0x44, 0xec, 0xc3, 0xb2, 0x9d, 0x50, 0x7f, 0xe, 0x21, 0x43, 0x6c, 0x1d, 0x32, 0xff, 0xd0, 0xa1, 0x8e, 0x26, 0x9, 0x78, 0x57, 0x9a, 0xb5, 0xc4, 0xeb, 0xf, 0x20, 0x51, 0x7e, 0xb3, 0x9c, 0xed, 0xc2, 0x6a, 0x45, 0x34, 0x1b, 0xd6, 0xf9, 0x88, 0xa7, 0xc5, 0xea, 0x9b, 0xb4, 0x79, 0x56, 0x27, 0x8, 0xa0, 0x8f, 0xfe, 0xd1, 0x1c, 0x33, 0x42, 0x6d, 0x86, 0xa9, 0xd8, 0xf7, 0x3a, 0x15, 0x64, 0x4b, 0xe3, 0xcc, 0xbd, 0x92, 0x5f, 0x70, 0x1, 0x2e, 0x4c, 0x63, 0x12, 0x3d, 0xf0, 0xdf, 0xae, 0x81, 0x29, 0x6, 0x77, 0x58, 0x95, 0xba, 0xcb, 0xe4, 0x1e, 0x31, 0x40, 0x6f, 0xa2, 0x8d, 0xfc, 0xd3, 0x7b, 0x54, 0x25, 0xa, 0xc7, 0xe8, 0x99, 0xb6, 0xd4, 0xfb, 0x8a, 0xa5, 0x68, 0x47, 0x36, 0x19, 0xb1, 0x9e, 0xef, 0xc0, 0xd, 0x22, 0x53, 0x7c, 0x97, 0xb8, 0xc9, 0xe6, 0x2b, 0x4, 0x75, 0x5a, 0xf2, 0xdd, 0xac, 0x83, 0x4e, 0x61, 0x10, 0x3f, 0x5d, 0x72, 0x3, 0x2c, 0xe1, 0xce, 0xbf, 0x90, 0x38, 0x17, 0x66, 0x49, 0x84, 0xab, 0xda, 0xf5, 0x11, 0x3e, 0x4f, 0x60, 0xad, 0x82, 0xf3, 0xdc, 0x74, 0x5b, 0x2a, 0x5, 0xc8, 0xe7, 0x96, 0xb9, 0xdb, 0xf4, 0x85, 0xaa, 0x67, 0x48, 0x39, 0x16, 0xbe, 0x91, 0xe0, 0xcf, 0x2, 0x2d, 0x5c, 0x73, 0x98, 0xb7, 0xc6, 0xe9, 0x24, 0xb, 0x7a, 0x55, 0xfd, 0xd2, 0xa3, 0x8c, 0x41, 0x6e, 0x1f, 0x30, 0x52, 0x7d, 0xc, 0x23, 0xee, 0xc1, 0xb0, 0x9f, 0x37, 0x18, 0x69, 0x46, 0x8b, 0xa4, 0xd5, 0xfa}, + {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd, 0x27, 0x17, 0x47, 0x77, 0xe7, 0xd7, 0x87, 0xb7, 0xba, 0x8a, 0xda, 0xea, 0x7a, 0x4a, 0x1a, 0x2a, 0x4e, 0x7e, 0x2e, 0x1e, 0x8e, 0xbe, 0xee, 0xde, 0xd3, 0xe3, 0xb3, 0x83, 0x13, 0x23, 0x73, 0x43, 0x69, 0x59, 0x9, 0x39, 0xa9, 0x99, 0xc9, 0xf9, 0xf4, 0xc4, 0x94, 0xa4, 0x34, 0x4, 0x54, 0x64, 0x9c, 0xac, 0xfc, 0xcc, 0x5c, 0x6c, 0x3c, 0xc, 0x1, 0x31, 0x61, 0x51, 0xc1, 0xf1, 0xa1, 0x91, 0xbb, 0x8b, 0xdb, 0xeb, 0x7b, 0x4b, 0x1b, 0x2b, 0x26, 0x16, 0x46, 0x76, 0xe6, 0xd6, 0x86, 0xb6, 0xd2, 0xe2, 0xb2, 0x82, 0x12, 0x22, 0x72, 0x42, 0x4f, 0x7f, 0x2f, 0x1f, 0x8f, 0xbf, 0xef, 0xdf, 0xf5, 0xc5, 0x95, 0xa5, 0x35, 0x5, 0x55, 0x65, 0x68, 0x58, 0x8, 0x38, 0xa8, 0x98, 0xc8, 0xf8, 0x25, 0x15, 0x45, 0x75, 0xe5, 0xd5, 0x85, 0xb5, 0xb8, 0x88, 0xd8, 0xe8, 0x78, 0x48, 0x18, 0x28, 0x2, 0x32, 0x62, 0x52, 0xc2, 0xf2, 0xa2, 0x92, 0x9f, 0xaf, 0xff, 0xcf, 0x5f, 0x6f, 0x3f, 0xf, 0x6b, 0x5b, 0xb, 0x3b, 0xab, 0x9b, 0xcb, 0xfb, 0xf6, 0xc6, 0x96, 0xa6, 0x36, 0x6, 0x56, 0x66, 0x4c, 0x7c, 0x2c, 0x1c, 0x8c, 0xbc, 0xec, 0xdc, 0xd1, 0xe1, 0xb1, 0x81, 0x11, 0x21, 0x71, 0x41, 0xb9, 0x89, 0xd9, 0xe9, 0x79, 0x49, 0x19, 0x29, 0x24, 0x14, 0x44, 0x74, 0xe4, 0xd4, 0x84, 0xb4, 0x9e, 0xae, 0xfe, 0xce, 0x5e, 0x6e, 0x3e, 0xe, 0x3, 0x33, 0x63, 0x53, 0xc3, 0xf3, 0xa3, 0x93, 0xf7, 0xc7, 0x97, 0xa7, 0x37, 0x7, 0x57, 0x67, 0x6a, 0x5a, 0xa, 0x3a, 0xaa, 0x9a, 0xca, 0xfa, 0xd0, 0xe0, 0xb0, 0x80, 0x10, 0x20, 0x70, 0x40, 0x4d, 0x7d, 0x2d, 0x1d, 0x8d, 0xbd, 0xed, 0xdd}, + {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2, 0x37, 0x6, 0x55, 0x64, 0xf3, 0xc2, 0x91, 0xa0, 0xa2, 0x93, 0xc0, 0xf1, 0x66, 0x57, 0x4, 0x35, 0x6e, 0x5f, 0xc, 0x3d, 0xaa, 0x9b, 0xc8, 0xf9, 0xfb, 0xca, 0x99, 0xa8, 0x3f, 0xe, 0x5d, 0x6c, 0x59, 0x68, 0x3b, 0xa, 0x9d, 0xac, 0xff, 0xce, 0xcc, 0xfd, 0xae, 0x9f, 0x8, 0x39, 0x6a, 0x5b, 0xdc, 0xed, 0xbe, 0x8f, 0x18, 0x29, 0x7a, 0x4b, 0x49, 0x78, 0x2b, 0x1a, 0x8d, 0xbc, 0xef, 0xde, 0xeb, 
0xda, 0x89, 0xb8, 0x2f, 0x1e, 0x4d, 0x7c, 0x7e, 0x4f, 0x1c, 0x2d, 0xba, 0x8b, 0xd8, 0xe9, 0xb2, 0x83, 0xd0, 0xe1, 0x76, 0x47, 0x14, 0x25, 0x27, 0x16, 0x45, 0x74, 0xe3, 0xd2, 0x81, 0xb0, 0x85, 0xb4, 0xe7, 0xd6, 0x41, 0x70, 0x23, 0x12, 0x10, 0x21, 0x72, 0x43, 0xd4, 0xe5, 0xb6, 0x87, 0xa5, 0x94, 0xc7, 0xf6, 0x61, 0x50, 0x3, 0x32, 0x30, 0x1, 0x52, 0x63, 0xf4, 0xc5, 0x96, 0xa7, 0x92, 0xa3, 0xf0, 0xc1, 0x56, 0x67, 0x34, 0x5, 0x7, 0x36, 0x65, 0x54, 0xc3, 0xf2, 0xa1, 0x90, 0xcb, 0xfa, 0xa9, 0x98, 0xf, 0x3e, 0x6d, 0x5c, 0x5e, 0x6f, 0x3c, 0xd, 0x9a, 0xab, 0xf8, 0xc9, 0xfc, 0xcd, 0x9e, 0xaf, 0x38, 0x9, 0x5a, 0x6b, 0x69, 0x58, 0xb, 0x3a, 0xad, 0x9c, 0xcf, 0xfe, 0x79, 0x48, 0x1b, 0x2a, 0xbd, 0x8c, 0xdf, 0xee, 0xec, 0xdd, 0x8e, 0xbf, 0x28, 0x19, 0x4a, 0x7b, 0x4e, 0x7f, 0x2c, 0x1d, 0x8a, 0xbb, 0xe8, 0xd9, 0xdb, 0xea, 0xb9, 0x88, 0x1f, 0x2e, 0x7d, 0x4c, 0x17, 0x26, 0x75, 0x44, 0xd3, 0xe2, 0xb1, 0x80, 0x82, 0xb3, 0xe0, 0xd1, 0x46, 0x77, 0x24, 0x15, 0x20, 0x11, 0x42, 0x73, 0xe4, 0xd5, 0x86, 0xb7, 0xb5, 0x84, 0xd7, 0xe6, 0x71, 0x40, 0x13, 0x22}, + {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13, 0x7, 0x35, 0x63, 0x51, 0xcf, 0xfd, 0xab, 0x99, 0x8a, 0xb8, 0xee, 0xdc, 0x42, 0x70, 0x26, 0x14, 0xe, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90, 0x83, 0xb1, 0xe7, 0xd5, 0x4b, 0x79, 0x2f, 0x1d, 0x9, 0x3b, 0x6d, 0x5f, 0xc1, 0xf3, 0xa5, 0x97, 0x84, 0xb6, 0xe0, 0xd2, 0x4c, 0x7e, 0x28, 0x1a, 0x1c, 0x2e, 0x78, 0x4a, 0xd4, 0xe6, 0xb0, 0x82, 0x91, 0xa3, 0xf5, 0xc7, 0x59, 0x6b, 0x3d, 0xf, 0x1b, 0x29, 0x7f, 0x4d, 0xd3, 0xe1, 0xb7, 0x85, 0x96, 0xa4, 0xf2, 0xc0, 0x5e, 0x6c, 0x3a, 0x8, 0x12, 0x20, 0x76, 0x44, 0xda, 0xe8, 0xbe, 0x8c, 0x9f, 0xad, 0xfb, 0xc9, 0x57, 0x65, 0x33, 0x1, 0x15, 0x27, 0x71, 0x43, 0xdd, 0xef, 0xb9, 0x8b, 0x98, 0xaa, 0xfc, 0xce, 0x50, 0x62, 0x34, 0x6, 0x38, 0xa, 0x5c, 0x6e, 0xf0, 0xc2, 0x94, 0xa6, 0xb5, 0x87, 0xd1, 0xe3, 0x7d, 0x4f, 0x19, 0x2b, 0x3f, 0xd, 0x5b, 0x69, 0xf7, 0xc5, 0x93, 0xa1, 0xb2, 0x80, 0xd6, 0xe4, 0x7a, 0x48, 0x1e, 0x2c, 0x36, 0x4, 0x52, 0x60, 0xfe, 0xcc, 0x9a, 0xa8, 0xbb, 0x89, 0xdf, 0xed, 0x73, 0x41, 0x17, 0x25, 0x31, 0x3, 0x55, 0x67, 0xf9, 0xcb, 0x9d, 0xaf, 0xbc, 0x8e, 0xd8, 0xea, 0x74, 0x46, 0x10, 0x22, 0x24, 0x16, 0x40, 0x72, 0xec, 0xde, 0x88, 0xba, 0xa9, 0x9b, 0xcd, 0xff, 0x61, 0x53, 0x5, 0x37, 0x23, 0x11, 0x47, 0x75, 0xeb, 0xd9, 0x8f, 0xbd, 0xae, 0x9c, 0xca, 0xf8, 0x66, 0x54, 0x2, 0x30, 0x2a, 0x18, 0x4e, 0x7c, 0xe2, 0xd0, 0x86, 0xb4, 0xa7, 0x95, 0xc3, 0xf1, 0x6f, 0x5d, 0xb, 0x39, 0x2d, 0x1f, 0x49, 0x7b, 0xe5, 0xd7, 0x81, 0xb3, 0xa0, 0x92, 0xc4, 0xf6, 0x68, 0x5a, 0xc, 0x3e}, + {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c, 0x17, 0x24, 0x71, 0x42, 0xdb, 0xe8, 0xbd, 0x8e, 0x92, 0xa1, 0xf4, 0xc7, 0x5e, 0x6d, 0x38, 0xb, 0x2e, 0x1d, 0x48, 0x7b, 0xe2, 0xd1, 0x84, 0xb7, 0xab, 0x98, 0xcd, 0xfe, 0x67, 0x54, 0x1, 0x32, 0x39, 0xa, 0x5f, 0x6c, 0xf5, 0xc6, 0x93, 0xa0, 0xbc, 0x8f, 0xda, 0xe9, 0x70, 0x43, 0x16, 0x25, 0x5c, 0x6f, 0x3a, 0x9, 0x90, 0xa3, 0xf6, 0xc5, 0xd9, 0xea, 0xbf, 0x8c, 0x15, 0x26, 0x73, 0x40, 0x4b, 0x78, 0x2d, 0x1e, 0x87, 0xb4, 0xe1, 0xd2, 0xce, 0xfd, 0xa8, 0x9b, 0x2, 0x31, 0x64, 0x57, 0x72, 0x41, 0x14, 0x27, 0xbe, 0x8d, 0xd8, 0xeb, 0xf7, 0xc4, 0x91, 0xa2, 0x3b, 0x8, 0x5d, 0x6e, 0x65, 0x56, 0x3, 0x30, 0xa9, 0x9a, 0xcf, 0xfc, 0xe0, 0xd3, 0x86, 0xb5, 0x2c, 0x1f, 0x4a, 0x79, 0xb8, 0x8b, 0xde, 0xed, 0x74, 0x47, 0x12, 0x21, 0x3d, 0xe, 0x5b, 0x68, 0xf1, 0xc2, 0x97, 0xa4, 0xaf, 0x9c, 0xc9, 0xfa, 0x63, 0x50, 0x5, 0x36, 0x2a, 0x19, 0x4c, 0x7f, 0xe6, 0xd5, 0x80, 0xb3, 0x96, 0xa5, 0xf0, 0xc3, 0x5a, 
0x69, 0x3c, 0xf, 0x13, 0x20, 0x75, 0x46, 0xdf, 0xec, 0xb9, 0x8a, 0x81, 0xb2, 0xe7, 0xd4, 0x4d, 0x7e, 0x2b, 0x18, 0x4, 0x37, 0x62, 0x51, 0xc8, 0xfb, 0xae, 0x9d, 0xe4, 0xd7, 0x82, 0xb1, 0x28, 0x1b, 0x4e, 0x7d, 0x61, 0x52, 0x7, 0x34, 0xad, 0x9e, 0xcb, 0xf8, 0xf3, 0xc0, 0x95, 0xa6, 0x3f, 0xc, 0x59, 0x6a, 0x76, 0x45, 0x10, 0x23, 0xba, 0x89, 0xdc, 0xef, 0xca, 0xf9, 0xac, 0x9f, 0x6, 0x35, 0x60, 0x53, 0x4f, 0x7c, 0x29, 0x1a, 0x83, 0xb0, 0xe5, 0xd6, 0xdd, 0xee, 0xbb, 0x88, 0x11, 0x22, 0x77, 0x44, 0x58, 0x6b, 0x3e, 0xd, 0x94, 0xa7, 0xf2, 0xc1}, + {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31, 0x67, 0x53, 0xf, 0x3b, 0xb7, 0x83, 0xdf, 0xeb, 0xda, 0xee, 0xb2, 0x86, 0xa, 0x3e, 0x62, 0x56, 0xce, 0xfa, 0xa6, 0x92, 0x1e, 0x2a, 0x76, 0x42, 0x73, 0x47, 0x1b, 0x2f, 0xa3, 0x97, 0xcb, 0xff, 0xa9, 0x9d, 0xc1, 0xf5, 0x79, 0x4d, 0x11, 0x25, 0x14, 0x20, 0x7c, 0x48, 0xc4, 0xf0, 0xac, 0x98, 0x81, 0xb5, 0xe9, 0xdd, 0x51, 0x65, 0x39, 0xd, 0x3c, 0x8, 0x54, 0x60, 0xec, 0xd8, 0x84, 0xb0, 0xe6, 0xd2, 0x8e, 0xba, 0x36, 0x2, 0x5e, 0x6a, 0x5b, 0x6f, 0x33, 0x7, 0x8b, 0xbf, 0xe3, 0xd7, 0x4f, 0x7b, 0x27, 0x13, 0x9f, 0xab, 0xf7, 0xc3, 0xf2, 0xc6, 0x9a, 0xae, 0x22, 0x16, 0x4a, 0x7e, 0x28, 0x1c, 0x40, 0x74, 0xf8, 0xcc, 0x90, 0xa4, 0x95, 0xa1, 0xfd, 0xc9, 0x45, 0x71, 0x2d, 0x19, 0x1f, 0x2b, 0x77, 0x43, 0xcf, 0xfb, 0xa7, 0x93, 0xa2, 0x96, 0xca, 0xfe, 0x72, 0x46, 0x1a, 0x2e, 0x78, 0x4c, 0x10, 0x24, 0xa8, 0x9c, 0xc0, 0xf4, 0xc5, 0xf1, 0xad, 0x99, 0x15, 0x21, 0x7d, 0x49, 0xd1, 0xe5, 0xb9, 0x8d, 0x1, 0x35, 0x69, 0x5d, 0x6c, 0x58, 0x4, 0x30, 0xbc, 0x88, 0xd4, 0xe0, 0xb6, 0x82, 0xde, 0xea, 0x66, 0x52, 0xe, 0x3a, 0xb, 0x3f, 0x63, 0x57, 0xdb, 0xef, 0xb3, 0x87, 0x9e, 0xaa, 0xf6, 0xc2, 0x4e, 0x7a, 0x26, 0x12, 0x23, 0x17, 0x4b, 0x7f, 0xf3, 0xc7, 0x9b, 0xaf, 0xf9, 0xcd, 0x91, 0xa5, 0x29, 0x1d, 0x41, 0x75, 0x44, 0x70, 0x2c, 0x18, 0x94, 0xa0, 0xfc, 0xc8, 0x50, 0x64, 0x38, 0xc, 0x80, 0xb4, 0xe8, 0xdc, 0xed, 0xd9, 0x85, 0xb1, 0x3d, 0x9, 0x55, 0x61, 0x37, 0x3, 0x5f, 0x6b, 0xe7, 0xd3, 0x8f, 0xbb, 0x8a, 0xbe, 0xe2, 0xd6, 0x5a, 0x6e, 0x32, 0x6}, + {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e, 0x77, 0x42, 0x1d, 0x28, 0xa3, 0x96, 0xc9, 0xfc, 0xc2, 0xf7, 0xa8, 0x9d, 0x16, 0x23, 0x7c, 0x49, 0xee, 0xdb, 0x84, 0xb1, 0x3a, 0xf, 0x50, 0x65, 0x5b, 0x6e, 0x31, 0x4, 0x8f, 0xba, 0xe5, 0xd0, 0x99, 0xac, 0xf3, 0xc6, 0x4d, 0x78, 0x27, 0x12, 0x2c, 0x19, 0x46, 0x73, 0xf8, 0xcd, 0x92, 0xa7, 0xc1, 0xf4, 0xab, 0x9e, 0x15, 0x20, 0x7f, 0x4a, 0x74, 0x41, 0x1e, 0x2b, 0xa0, 0x95, 0xca, 0xff, 0xb6, 0x83, 0xdc, 0xe9, 0x62, 0x57, 0x8, 0x3d, 0x3, 0x36, 0x69, 0x5c, 0xd7, 0xe2, 0xbd, 0x88, 0x2f, 0x1a, 0x45, 0x70, 0xfb, 0xce, 0x91, 0xa4, 0x9a, 0xaf, 0xf0, 0xc5, 0x4e, 0x7b, 0x24, 0x11, 0x58, 0x6d, 0x32, 0x7, 0x8c, 0xb9, 0xe6, 0xd3, 0xed, 0xd8, 0x87, 0xb2, 0x39, 0xc, 0x53, 0x66, 0x9f, 0xaa, 0xf5, 0xc0, 0x4b, 0x7e, 0x21, 0x14, 0x2a, 0x1f, 0x40, 0x75, 0xfe, 0xcb, 0x94, 0xa1, 0xe8, 0xdd, 0x82, 0xb7, 0x3c, 0x9, 0x56, 0x63, 0x5d, 0x68, 0x37, 0x2, 0x89, 0xbc, 0xe3, 0xd6, 0x71, 0x44, 0x1b, 0x2e, 0xa5, 0x90, 0xcf, 0xfa, 0xc4, 0xf1, 0xae, 0x9b, 0x10, 0x25, 0x7a, 0x4f, 0x6, 0x33, 0x6c, 0x59, 0xd2, 0xe7, 0xb8, 0x8d, 0xb3, 0x86, 0xd9, 0xec, 0x67, 0x52, 0xd, 0x38, 0x5e, 0x6b, 0x34, 0x1, 0x8a, 0xbf, 0xe0, 0xd5, 0xeb, 0xde, 0x81, 0xb4, 0x3f, 0xa, 0x55, 0x60, 0x29, 0x1c, 0x43, 0x76, 0xfd, 0xc8, 0x97, 0xa2, 0x9c, 0xa9, 0xf6, 0xc3, 0x48, 0x7d, 0x22, 0x17, 0xb0, 0x85, 0xda, 0xef, 0x64, 0x51, 0xe, 0x3b, 0x5, 0x30, 0x6f, 0x5a, 0xd1, 0xe4, 0xbb, 0x8e, 0xc7, 0xf2, 0xad, 0x98, 0x13, 0x26, 0x79, 0x4c, 0x72, 0x47, 
0x18, 0x2d, 0xa6, 0x93, 0xcc, 0xf9}, + {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f, 0x47, 0x71, 0x2b, 0x1d, 0x9f, 0xa9, 0xf3, 0xc5, 0xea, 0xdc, 0x86, 0xb0, 0x32, 0x4, 0x5e, 0x68, 0x8e, 0xb8, 0xe2, 0xd4, 0x56, 0x60, 0x3a, 0xc, 0x23, 0x15, 0x4f, 0x79, 0xfb, 0xcd, 0x97, 0xa1, 0xc9, 0xff, 0xa5, 0x93, 0x11, 0x27, 0x7d, 0x4b, 0x64, 0x52, 0x8, 0x3e, 0xbc, 0x8a, 0xd0, 0xe6, 0x1, 0x37, 0x6d, 0x5b, 0xd9, 0xef, 0xb5, 0x83, 0xac, 0x9a, 0xc0, 0xf6, 0x74, 0x42, 0x18, 0x2e, 0x46, 0x70, 0x2a, 0x1c, 0x9e, 0xa8, 0xf2, 0xc4, 0xeb, 0xdd, 0x87, 0xb1, 0x33, 0x5, 0x5f, 0x69, 0x8f, 0xb9, 0xe3, 0xd5, 0x57, 0x61, 0x3b, 0xd, 0x22, 0x14, 0x4e, 0x78, 0xfa, 0xcc, 0x96, 0xa0, 0xc8, 0xfe, 0xa4, 0x92, 0x10, 0x26, 0x7c, 0x4a, 0x65, 0x53, 0x9, 0x3f, 0xbd, 0x8b, 0xd1, 0xe7, 0x2, 0x34, 0x6e, 0x58, 0xda, 0xec, 0xb6, 0x80, 0xaf, 0x99, 0xc3, 0xf5, 0x77, 0x41, 0x1b, 0x2d, 0x45, 0x73, 0x29, 0x1f, 0x9d, 0xab, 0xf1, 0xc7, 0xe8, 0xde, 0x84, 0xb2, 0x30, 0x6, 0x5c, 0x6a, 0x8c, 0xba, 0xe0, 0xd6, 0x54, 0x62, 0x38, 0xe, 0x21, 0x17, 0x4d, 0x7b, 0xf9, 0xcf, 0x95, 0xa3, 0xcb, 0xfd, 0xa7, 0x91, 0x13, 0x25, 0x7f, 0x49, 0x66, 0x50, 0xa, 0x3c, 0xbe, 0x88, 0xd2, 0xe4, 0x3, 0x35, 0x6f, 0x59, 0xdb, 0xed, 0xb7, 0x81, 0xae, 0x98, 0xc2, 0xf4, 0x76, 0x40, 0x1a, 0x2c, 0x44, 0x72, 0x28, 0x1e, 0x9c, 0xaa, 0xf0, 0xc6, 0xe9, 0xdf, 0x85, 0xb3, 0x31, 0x7, 0x5d, 0x6b, 0x8d, 0xbb, 0xe1, 0xd7, 0x55, 0x63, 0x39, 0xf, 0x20, 0x16, 0x4c, 0x7a, 0xf8, 0xce, 0x94, 0xa2, 0xca, 0xfc, 0xa6, 0x90, 0x12, 0x24, 0x7e, 0x48, 0x67, 0x51, 0xb, 0x3d, 0xbf, 0x89, 0xd3, 0xe5}, + {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20, 0x57, 0x60, 0x39, 0xe, 0x8b, 0xbc, 0xe5, 0xd2, 0xf2, 0xc5, 0x9c, 0xab, 0x2e, 0x19, 0x40, 0x77, 0xae, 0x99, 0xc0, 0xf7, 0x72, 0x45, 0x1c, 0x2b, 0xb, 0x3c, 0x65, 0x52, 0xd7, 0xe0, 0xb9, 0x8e, 0xf9, 0xce, 0x97, 0xa0, 0x25, 0x12, 0x4b, 0x7c, 0x5c, 0x6b, 0x32, 0x5, 0x80, 0xb7, 0xee, 0xd9, 0x41, 0x76, 0x2f, 0x18, 0x9d, 0xaa, 0xf3, 0xc4, 0xe4, 0xd3, 0x8a, 0xbd, 0x38, 0xf, 0x56, 0x61, 0x16, 0x21, 0x78, 0x4f, 0xca, 0xfd, 0xa4, 0x93, 0xb3, 0x84, 0xdd, 0xea, 0x6f, 0x58, 0x1, 0x36, 0xef, 0xd8, 0x81, 0xb6, 0x33, 0x4, 0x5d, 0x6a, 0x4a, 0x7d, 0x24, 0x13, 0x96, 0xa1, 0xf8, 0xcf, 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0xa, 0x3d, 0x1d, 0x2a, 0x73, 0x44, 0xc1, 0xf6, 0xaf, 0x98, 0x82, 0xb5, 0xec, 0xdb, 0x5e, 0x69, 0x30, 0x7, 0x27, 0x10, 0x49, 0x7e, 0xfb, 0xcc, 0x95, 0xa2, 0xd5, 0xe2, 0xbb, 0x8c, 0x9, 0x3e, 0x67, 0x50, 0x70, 0x47, 0x1e, 0x29, 0xac, 0x9b, 0xc2, 0xf5, 0x2c, 0x1b, 0x42, 0x75, 0xf0, 0xc7, 0x9e, 0xa9, 0x89, 0xbe, 0xe7, 0xd0, 0x55, 0x62, 0x3b, 0xc, 0x7b, 0x4c, 0x15, 0x22, 0xa7, 0x90, 0xc9, 0xfe, 0xde, 0xe9, 0xb0, 0x87, 0x2, 0x35, 0x6c, 0x5b, 0xc3, 0xf4, 0xad, 0x9a, 0x1f, 0x28, 0x71, 0x46, 0x66, 0x51, 0x8, 0x3f, 0xba, 0x8d, 0xd4, 0xe3, 0x94, 0xa3, 0xfa, 0xcd, 0x48, 0x7f, 0x26, 0x11, 0x31, 0x6, 0x5f, 0x68, 0xed, 0xda, 0x83, 0xb4, 0x6d, 0x5a, 0x3, 0x34, 0xb1, 0x86, 0xdf, 0xe8, 0xc8, 0xff, 0xa6, 0x91, 0x14, 0x23, 0x7a, 0x4d, 0x3a, 0xd, 0x54, 0x63, 0xe6, 0xd1, 0x88, 0xbf, 0x9f, 0xa8, 0xf1, 0xc6, 0x43, 0x74, 0x2d, 0x1a}, + {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75, 0xa7, 0x9f, 0xd7, 0xef, 0x47, 0x7f, 0x37, 0xf, 0x7a, 0x42, 0xa, 0x32, 0x9a, 0xa2, 0xea, 0xd2, 0x53, 0x6b, 0x23, 0x1b, 0xb3, 0x8b, 0xc3, 0xfb, 0x8e, 0xb6, 0xfe, 0xc6, 0x6e, 0x56, 0x1e, 0x26, 0xf4, 0xcc, 0x84, 0xbc, 0x14, 0x2c, 0x64, 0x5c, 0x29, 0x11, 0x59, 0x61, 0xc9, 0xf1, 0xb9, 0x81, 0xa6, 0x9e, 0xd6, 0xee, 0x46, 0x7e, 0x36, 0xe, 0x7b, 0x43, 0xb, 0x33, 0x9b, 0xa3, 
0xeb, 0xd3, 0x1, 0x39, 0x71, 0x49, 0xe1, 0xd9, 0x91, 0xa9, 0xdc, 0xe4, 0xac, 0x94, 0x3c, 0x4, 0x4c, 0x74, 0xf5, 0xcd, 0x85, 0xbd, 0x15, 0x2d, 0x65, 0x5d, 0x28, 0x10, 0x58, 0x60, 0xc8, 0xf0, 0xb8, 0x80, 0x52, 0x6a, 0x22, 0x1a, 0xb2, 0x8a, 0xc2, 0xfa, 0x8f, 0xb7, 0xff, 0xc7, 0x6f, 0x57, 0x1f, 0x27, 0x51, 0x69, 0x21, 0x19, 0xb1, 0x89, 0xc1, 0xf9, 0x8c, 0xb4, 0xfc, 0xc4, 0x6c, 0x54, 0x1c, 0x24, 0xf6, 0xce, 0x86, 0xbe, 0x16, 0x2e, 0x66, 0x5e, 0x2b, 0x13, 0x5b, 0x63, 0xcb, 0xf3, 0xbb, 0x83, 0x2, 0x3a, 0x72, 0x4a, 0xe2, 0xda, 0x92, 0xaa, 0xdf, 0xe7, 0xaf, 0x97, 0x3f, 0x7, 0x4f, 0x77, 0xa5, 0x9d, 0xd5, 0xed, 0x45, 0x7d, 0x35, 0xd, 0x78, 0x40, 0x8, 0x30, 0x98, 0xa0, 0xe8, 0xd0, 0xf7, 0xcf, 0x87, 0xbf, 0x17, 0x2f, 0x67, 0x5f, 0x2a, 0x12, 0x5a, 0x62, 0xca, 0xf2, 0xba, 0x82, 0x50, 0x68, 0x20, 0x18, 0xb0, 0x88, 0xc0, 0xf8, 0x8d, 0xb5, 0xfd, 0xc5, 0x6d, 0x55, 0x1d, 0x25, 0xa4, 0x9c, 0xd4, 0xec, 0x44, 0x7c, 0x34, 0xc, 0x79, 0x41, 0x9, 0x31, 0x99, 0xa1, 0xe9, 0xd1, 0x3, 0x3b, 0x73, 0x4b, 0xe3, 0xdb, 0x93, 0xab, 0xde, 0xe6, 0xae, 0x96, 0x3e, 0x6, 0x4e, 0x76}, + {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a, 0xb7, 0x8e, 0xc5, 0xfc, 0x53, 0x6a, 0x21, 0x18, 0x62, 0x5b, 0x10, 0x29, 0x86, 0xbf, 0xf4, 0xcd, 0x73, 0x4a, 0x1, 0x38, 0x97, 0xae, 0xe5, 0xdc, 0xa6, 0x9f, 0xd4, 0xed, 0x42, 0x7b, 0x30, 0x9, 0xc4, 0xfd, 0xb6, 0x8f, 0x20, 0x19, 0x52, 0x6b, 0x11, 0x28, 0x63, 0x5a, 0xf5, 0xcc, 0x87, 0xbe, 0xe6, 0xdf, 0x94, 0xad, 0x2, 0x3b, 0x70, 0x49, 0x33, 0xa, 0x41, 0x78, 0xd7, 0xee, 0xa5, 0x9c, 0x51, 0x68, 0x23, 0x1a, 0xb5, 0x8c, 0xc7, 0xfe, 0x84, 0xbd, 0xf6, 0xcf, 0x60, 0x59, 0x12, 0x2b, 0x95, 0xac, 0xe7, 0xde, 0x71, 0x48, 0x3, 0x3a, 0x40, 0x79, 0x32, 0xb, 0xa4, 0x9d, 0xd6, 0xef, 0x22, 0x1b, 0x50, 0x69, 0xc6, 0xff, 0xb4, 0x8d, 0xf7, 0xce, 0x85, 0xbc, 0x13, 0x2a, 0x61, 0x58, 0xd1, 0xe8, 0xa3, 0x9a, 0x35, 0xc, 0x47, 0x7e, 0x4, 0x3d, 0x76, 0x4f, 0xe0, 0xd9, 0x92, 0xab, 0x66, 0x5f, 0x14, 0x2d, 0x82, 0xbb, 0xf0, 0xc9, 0xb3, 0x8a, 0xc1, 0xf8, 0x57, 0x6e, 0x25, 0x1c, 0xa2, 0x9b, 0xd0, 0xe9, 0x46, 0x7f, 0x34, 0xd, 0x77, 0x4e, 0x5, 0x3c, 0x93, 0xaa, 0xe1, 0xd8, 0x15, 0x2c, 0x67, 0x5e, 0xf1, 0xc8, 0x83, 0xba, 0xc0, 0xf9, 0xb2, 0x8b, 0x24, 0x1d, 0x56, 0x6f, 0x37, 0xe, 0x45, 0x7c, 0xd3, 0xea, 0xa1, 0x98, 0xe2, 0xdb, 0x90, 0xa9, 0x6, 0x3f, 0x74, 0x4d, 0x80, 0xb9, 0xf2, 0xcb, 0x64, 0x5d, 0x16, 0x2f, 0x55, 0x6c, 0x27, 0x1e, 0xb1, 0x88, 0xc3, 0xfa, 0x44, 0x7d, 0x36, 0xf, 0xa0, 0x99, 0xd2, 0xeb, 0x91, 0xa8, 0xe3, 0xda, 0x75, 0x4c, 0x7, 0x3e, 0xf3, 0xca, 0x81, 0xb8, 0x17, 0x2e, 0x65, 0x5c, 0x26, 0x1f, 0x54, 0x6d, 0xc2, 0xfb, 0xb0, 0x89}, + {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b, 0x87, 0xbd, 0xf3, 0xc9, 0x6f, 0x55, 0x1b, 0x21, 0x4a, 0x70, 0x3e, 0x4, 0xa2, 0x98, 0xd6, 0xec, 0x13, 0x29, 0x67, 0x5d, 0xfb, 0xc1, 0x8f, 0xb5, 0xde, 0xe4, 0xaa, 0x90, 0x36, 0xc, 0x42, 0x78, 0x94, 0xae, 0xe0, 0xda, 0x7c, 0x46, 0x8, 0x32, 0x59, 0x63, 0x2d, 0x17, 0xb1, 0x8b, 0xc5, 0xff, 0x26, 0x1c, 0x52, 0x68, 0xce, 0xf4, 0xba, 0x80, 0xeb, 0xd1, 0x9f, 0xa5, 0x3, 0x39, 0x77, 0x4d, 0xa1, 0x9b, 0xd5, 0xef, 0x49, 0x73, 0x3d, 0x7, 0x6c, 0x56, 0x18, 0x22, 0x84, 0xbe, 0xf0, 0xca, 0x35, 0xf, 0x41, 0x7b, 0xdd, 0xe7, 0xa9, 0x93, 0xf8, 0xc2, 0x8c, 0xb6, 0x10, 0x2a, 0x64, 0x5e, 0xb2, 0x88, 0xc6, 0xfc, 0x5a, 0x60, 0x2e, 0x14, 0x7f, 0x45, 0xb, 0x31, 0x97, 0xad, 0xe3, 0xd9, 0x4c, 0x76, 0x38, 0x2, 0xa4, 0x9e, 0xd0, 0xea, 0x81, 0xbb, 0xf5, 0xcf, 0x69, 0x53, 0x1d, 0x27, 0xcb, 0xf1, 0xbf, 0x85, 0x23, 0x19, 0x57, 0x6d, 0x6, 0x3c, 0x72, 0x48, 0xee, 0xd4, 0x9a, 0xa0, 0x5f, 0x65, 0x2b, 
0x11, 0xb7, 0x8d, 0xc3, 0xf9, 0x92, 0xa8, 0xe6, 0xdc, 0x7a, 0x40, 0xe, 0x34, 0xd8, 0xe2, 0xac, 0x96, 0x30, 0xa, 0x44, 0x7e, 0x15, 0x2f, 0x61, 0x5b, 0xfd, 0xc7, 0x89, 0xb3, 0x6a, 0x50, 0x1e, 0x24, 0x82, 0xb8, 0xf6, 0xcc, 0xa7, 0x9d, 0xd3, 0xe9, 0x4f, 0x75, 0x3b, 0x1, 0xed, 0xd7, 0x99, 0xa3, 0x5, 0x3f, 0x71, 0x4b, 0x20, 0x1a, 0x54, 0x6e, 0xc8, 0xf2, 0xbc, 0x86, 0x79, 0x43, 0xd, 0x37, 0x91, 0xab, 0xe5, 0xdf, 0xb4, 0x8e, 0xc0, 0xfa, 0x5c, 0x66, 0x28, 0x12, 0xfe, 0xc4, 0x8a, 0xb0, 0x16, 0x2c, 0x62, 0x58, 0x33, 0x9, 0x47, 0x7d, 0xdb, 0xe1, 0xaf, 0x95}, + {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64, 0x97, 0xac, 0xe1, 0xda, 0x7b, 0x40, 0xd, 0x36, 0x52, 0x69, 0x24, 0x1f, 0xbe, 0x85, 0xc8, 0xf3, 0x33, 0x8, 0x45, 0x7e, 0xdf, 0xe4, 0xa9, 0x92, 0xf6, 0xcd, 0x80, 0xbb, 0x1a, 0x21, 0x6c, 0x57, 0xa4, 0x9f, 0xd2, 0xe9, 0x48, 0x73, 0x3e, 0x5, 0x61, 0x5a, 0x17, 0x2c, 0x8d, 0xb6, 0xfb, 0xc0, 0x66, 0x5d, 0x10, 0x2b, 0x8a, 0xb1, 0xfc, 0xc7, 0xa3, 0x98, 0xd5, 0xee, 0x4f, 0x74, 0x39, 0x2, 0xf1, 0xca, 0x87, 0xbc, 0x1d, 0x26, 0x6b, 0x50, 0x34, 0xf, 0x42, 0x79, 0xd8, 0xe3, 0xae, 0x95, 0x55, 0x6e, 0x23, 0x18, 0xb9, 0x82, 0xcf, 0xf4, 0x90, 0xab, 0xe6, 0xdd, 0x7c, 0x47, 0xa, 0x31, 0xc2, 0xf9, 0xb4, 0x8f, 0x2e, 0x15, 0x58, 0x63, 0x7, 0x3c, 0x71, 0x4a, 0xeb, 0xd0, 0x9d, 0xa6, 0xcc, 0xf7, 0xba, 0x81, 0x20, 0x1b, 0x56, 0x6d, 0x9, 0x32, 0x7f, 0x44, 0xe5, 0xde, 0x93, 0xa8, 0x5b, 0x60, 0x2d, 0x16, 0xb7, 0x8c, 0xc1, 0xfa, 0x9e, 0xa5, 0xe8, 0xd3, 0x72, 0x49, 0x4, 0x3f, 0xff, 0xc4, 0x89, 0xb2, 0x13, 0x28, 0x65, 0x5e, 0x3a, 0x1, 0x4c, 0x77, 0xd6, 0xed, 0xa0, 0x9b, 0x68, 0x53, 0x1e, 0x25, 0x84, 0xbf, 0xf2, 0xc9, 0xad, 0x96, 0xdb, 0xe0, 0x41, 0x7a, 0x37, 0xc, 0xaa, 0x91, 0xdc, 0xe7, 0x46, 0x7d, 0x30, 0xb, 0x6f, 0x54, 0x19, 0x22, 0x83, 0xb8, 0xf5, 0xce, 0x3d, 0x6, 0x4b, 0x70, 0xd1, 0xea, 0xa7, 0x9c, 0xf8, 0xc3, 0x8e, 0xb5, 0x14, 0x2f, 0x62, 0x59, 0x99, 0xa2, 0xef, 0xd4, 0x75, 0x4e, 0x3, 0x38, 0x5c, 0x67, 0x2a, 0x11, 0xb0, 0x8b, 0xc6, 0xfd, 0xe, 0x35, 0x78, 0x43, 0xe2, 0xd9, 0x94, 0xaf, 0xcb, 0xf0, 0xbd, 0x86, 0x27, 0x1c, 0x51, 0x6a}, + {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49, 0xe7, 0xdb, 0x9f, 0xa3, 0x17, 0x2b, 0x6f, 0x53, 0x1a, 0x26, 0x62, 0x5e, 0xea, 0xd6, 0x92, 0xae, 0xd3, 0xef, 0xab, 0x97, 0x23, 0x1f, 0x5b, 0x67, 0x2e, 0x12, 0x56, 0x6a, 0xde, 0xe2, 0xa6, 0x9a, 0x34, 0x8, 0x4c, 0x70, 0xc4, 0xf8, 0xbc, 0x80, 0xc9, 0xf5, 0xb1, 0x8d, 0x39, 0x5, 0x41, 0x7d, 0xbb, 0x87, 0xc3, 0xff, 0x4b, 0x77, 0x33, 0xf, 0x46, 0x7a, 0x3e, 0x2, 0xb6, 0x8a, 0xce, 0xf2, 0x5c, 0x60, 0x24, 0x18, 0xac, 0x90, 0xd4, 0xe8, 0xa1, 0x9d, 0xd9, 0xe5, 0x51, 0x6d, 0x29, 0x15, 0x68, 0x54, 0x10, 0x2c, 0x98, 0xa4, 0xe0, 0xdc, 0x95, 0xa9, 0xed, 0xd1, 0x65, 0x59, 0x1d, 0x21, 0x8f, 0xb3, 0xf7, 0xcb, 0x7f, 0x43, 0x7, 0x3b, 0x72, 0x4e, 0xa, 0x36, 0x82, 0xbe, 0xfa, 0xc6, 0x6b, 0x57, 0x13, 0x2f, 0x9b, 0xa7, 0xe3, 0xdf, 0x96, 0xaa, 0xee, 0xd2, 0x66, 0x5a, 0x1e, 0x22, 0x8c, 0xb0, 0xf4, 0xc8, 0x7c, 0x40, 0x4, 0x38, 0x71, 0x4d, 0x9, 0x35, 0x81, 0xbd, 0xf9, 0xc5, 0xb8, 0x84, 0xc0, 0xfc, 0x48, 0x74, 0x30, 0xc, 0x45, 0x79, 0x3d, 0x1, 0xb5, 0x89, 0xcd, 0xf1, 0x5f, 0x63, 0x27, 0x1b, 0xaf, 0x93, 0xd7, 0xeb, 0xa2, 0x9e, 0xda, 0xe6, 0x52, 0x6e, 0x2a, 0x16, 0xd0, 0xec, 0xa8, 0x94, 0x20, 0x1c, 0x58, 0x64, 0x2d, 0x11, 0x55, 0x69, 0xdd, 0xe1, 0xa5, 0x99, 0x37, 0xb, 0x4f, 0x73, 0xc7, 0xfb, 0xbf, 0x83, 0xca, 0xf6, 0xb2, 0x8e, 0x3a, 0x6, 0x42, 0x7e, 0x3, 0x3f, 0x7b, 0x47, 0xf3, 0xcf, 0x8b, 0xb7, 0xfe, 0xc2, 0x86, 0xba, 0xe, 0x32, 0x76, 0x4a, 0xe4, 0xd8, 0x9c, 0xa0, 0x14, 0x28, 0x6c, 0x50, 
0x19, 0x25, 0x61, 0x5d, 0xe9, 0xd5, 0x91, 0xad}, + {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46, 0xf7, 0xca, 0x8d, 0xb0, 0x3, 0x3e, 0x79, 0x44, 0x2, 0x3f, 0x78, 0x45, 0xf6, 0xcb, 0x8c, 0xb1, 0xf3, 0xce, 0x89, 0xb4, 0x7, 0x3a, 0x7d, 0x40, 0x6, 0x3b, 0x7c, 0x41, 0xf2, 0xcf, 0x88, 0xb5, 0x4, 0x39, 0x7e, 0x43, 0xf0, 0xcd, 0x8a, 0xb7, 0xf1, 0xcc, 0x8b, 0xb6, 0x5, 0x38, 0x7f, 0x42, 0xfb, 0xc6, 0x81, 0xbc, 0xf, 0x32, 0x75, 0x48, 0xe, 0x33, 0x74, 0x49, 0xfa, 0xc7, 0x80, 0xbd, 0xc, 0x31, 0x76, 0x4b, 0xf8, 0xc5, 0x82, 0xbf, 0xf9, 0xc4, 0x83, 0xbe, 0xd, 0x30, 0x77, 0x4a, 0x8, 0x35, 0x72, 0x4f, 0xfc, 0xc1, 0x86, 0xbb, 0xfd, 0xc0, 0x87, 0xba, 0x9, 0x34, 0x73, 0x4e, 0xff, 0xc2, 0x85, 0xb8, 0xb, 0x36, 0x71, 0x4c, 0xa, 0x37, 0x70, 0x4d, 0xfe, 0xc3, 0x84, 0xb9, 0xeb, 0xd6, 0x91, 0xac, 0x1f, 0x22, 0x65, 0x58, 0x1e, 0x23, 0x64, 0x59, 0xea, 0xd7, 0x90, 0xad, 0x1c, 0x21, 0x66, 0x5b, 0xe8, 0xd5, 0x92, 0xaf, 0xe9, 0xd4, 0x93, 0xae, 0x1d, 0x20, 0x67, 0x5a, 0x18, 0x25, 0x62, 0x5f, 0xec, 0xd1, 0x96, 0xab, 0xed, 0xd0, 0x97, 0xaa, 0x19, 0x24, 0x63, 0x5e, 0xef, 0xd2, 0x95, 0xa8, 0x1b, 0x26, 0x61, 0x5c, 0x1a, 0x27, 0x60, 0x5d, 0xee, 0xd3, 0x94, 0xa9, 0x10, 0x2d, 0x6a, 0x57, 0xe4, 0xd9, 0x9e, 0xa3, 0xe5, 0xd8, 0x9f, 0xa2, 0x11, 0x2c, 0x6b, 0x56, 0xe7, 0xda, 0x9d, 0xa0, 0x13, 0x2e, 0x69, 0x54, 0x12, 0x2f, 0x68, 0x55, 0xe6, 0xdb, 0x9c, 0xa1, 0xe3, 0xde, 0x99, 0xa4, 0x17, 0x2a, 0x6d, 0x50, 0x16, 0x2b, 0x6c, 0x51, 0xe2, 0xdf, 0x98, 0xa5, 0x14, 0x29, 0x6e, 0x53, 0xe0, 0xdd, 0x9a, 0xa7, 0xe1, 0xdc, 0x9b, 0xa6, 0x15, 0x28, 0x6f, 0x52}, + {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57, 0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x1, 0x43, 0x7d, 0x2a, 0x14, 0x56, 0x68, 0xd2, 0xec, 0xae, 0x90, 0x93, 0xad, 0xef, 0xd1, 0x6b, 0x55, 0x17, 0x29, 0x7e, 0x40, 0x2, 0x3c, 0x86, 0xb8, 0xfa, 0xc4, 0x54, 0x6a, 0x28, 0x16, 0xac, 0x92, 0xd0, 0xee, 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x3, 0x3b, 0x5, 0x47, 0x79, 0xc3, 0xfd, 0xbf, 0x81, 0xd6, 0xe8, 0xaa, 0x94, 0x2e, 0x10, 0x52, 0x6c, 0xfc, 0xc2, 0x80, 0xbe, 0x4, 0x3a, 0x78, 0x46, 0x11, 0x2f, 0x6d, 0x53, 0xe9, 0xd7, 0x95, 0xab, 0xa8, 0x96, 0xd4, 0xea, 0x50, 0x6e, 0x2c, 0x12, 0x45, 0x7b, 0x39, 0x7, 0xbd, 0x83, 0xc1, 0xff, 0x6f, 0x51, 0x13, 0x2d, 0x97, 0xa9, 0xeb, 0xd5, 0x82, 0xbc, 0xfe, 0xc0, 0x7a, 0x44, 0x6, 0x38, 0x76, 0x48, 0xa, 0x34, 0x8e, 0xb0, 0xf2, 0xcc, 0x9b, 0xa5, 0xe7, 0xd9, 0x63, 0x5d, 0x1f, 0x21, 0xb1, 0x8f, 0xcd, 0xf3, 0x49, 0x77, 0x35, 0xb, 0x5c, 0x62, 0x20, 0x1e, 0xa4, 0x9a, 0xd8, 0xe6, 0xe5, 0xdb, 0x99, 0xa7, 0x1d, 0x23, 0x61, 0x5f, 0x8, 0x36, 0x74, 0x4a, 0xf0, 0xce, 0x8c, 0xb2, 0x22, 0x1c, 0x5e, 0x60, 0xda, 0xe4, 0xa6, 0x98, 0xcf, 0xf1, 0xb3, 0x8d, 0x37, 0x9, 0x4b, 0x75, 0x4d, 0x73, 0x31, 0xf, 0xb5, 0x8b, 0xc9, 0xf7, 0xa0, 0x9e, 0xdc, 0xe2, 0x58, 0x66, 0x24, 0x1a, 0x8a, 0xb4, 0xf6, 0xc8, 0x72, 0x4c, 0xe, 0x30, 0x67, 0x59, 0x1b, 0x25, 0x9f, 0xa1, 0xe3, 0xdd, 0xde, 0xe0, 0xa2, 0x9c, 0x26, 0x18, 0x5a, 0x64, 0x33, 0xd, 0x4f, 0x71, 0xcb, 0xf5, 0xb7, 0x89, 0x19, 0x27, 0x65, 0x5b, 0xe1, 0xdf, 0x9d, 0xa3, 0xf4, 0xca, 0x88, 0xb6, 0xc, 0x32, 0x70, 0x4e}, + {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58, 0xd7, 0xe8, 0xa9, 0x96, 0x2b, 0x14, 0x55, 0x6a, 0x32, 0xd, 0x4c, 0x73, 0xce, 0xf1, 0xb0, 0x8f, 0xb3, 0x8c, 0xcd, 0xf2, 0x4f, 0x70, 0x31, 0xe, 0x56, 0x69, 0x28, 0x17, 0xaa, 0x95, 0xd4, 0xeb, 0x64, 0x5b, 0x1a, 0x25, 0x98, 0xa7, 0xe6, 0xd9, 0x81, 0xbe, 0xff, 0xc0, 0x7d, 0x42, 0x3, 0x3c, 0x7b, 0x44, 0x5, 0x3a, 0x87, 0xb8, 0xf9, 0xc6, 0x9e, 0xa1, 0xe0, 0xdf, 
0x62, 0x5d, 0x1c, 0x23, 0xac, 0x93, 0xd2, 0xed, 0x50, 0x6f, 0x2e, 0x11, 0x49, 0x76, 0x37, 0x8, 0xb5, 0x8a, 0xcb, 0xf4, 0xc8, 0xf7, 0xb6, 0x89, 0x34, 0xb, 0x4a, 0x75, 0x2d, 0x12, 0x53, 0x6c, 0xd1, 0xee, 0xaf, 0x90, 0x1f, 0x20, 0x61, 0x5e, 0xe3, 0xdc, 0x9d, 0xa2, 0xfa, 0xc5, 0x84, 0xbb, 0x6, 0x39, 0x78, 0x47, 0xf6, 0xc9, 0x88, 0xb7, 0xa, 0x35, 0x74, 0x4b, 0x13, 0x2c, 0x6d, 0x52, 0xef, 0xd0, 0x91, 0xae, 0x21, 0x1e, 0x5f, 0x60, 0xdd, 0xe2, 0xa3, 0x9c, 0xc4, 0xfb, 0xba, 0x85, 0x38, 0x7, 0x46, 0x79, 0x45, 0x7a, 0x3b, 0x4, 0xb9, 0x86, 0xc7, 0xf8, 0xa0, 0x9f, 0xde, 0xe1, 0x5c, 0x63, 0x22, 0x1d, 0x92, 0xad, 0xec, 0xd3, 0x6e, 0x51, 0x10, 0x2f, 0x77, 0x48, 0x9, 0x36, 0x8b, 0xb4, 0xf5, 0xca, 0x8d, 0xb2, 0xf3, 0xcc, 0x71, 0x4e, 0xf, 0x30, 0x68, 0x57, 0x16, 0x29, 0x94, 0xab, 0xea, 0xd5, 0x5a, 0x65, 0x24, 0x1b, 0xa6, 0x99, 0xd8, 0xe7, 0xbf, 0x80, 0xc1, 0xfe, 0x43, 0x7c, 0x3d, 0x2, 0x3e, 0x1, 0x40, 0x7f, 0xc2, 0xfd, 0xbc, 0x83, 0xdb, 0xe4, 0xa5, 0x9a, 0x27, 0x18, 0x59, 0x66, 0xe9, 0xd6, 0x97, 0xa8, 0x15, 0x2a, 0x6b, 0x54, 0xc, 0x33, 0x72, 0x4d, 0xf0, 0xcf, 0x8e, 0xb1}, + {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7, 0x74, 0x34, 0xf4, 0xb4, 0x69, 0x29, 0xe9, 0xa9, 0x4e, 0xe, 0xce, 0x8e, 0x53, 0x13, 0xd3, 0x93, 0xe8, 0xa8, 0x68, 0x28, 0xf5, 0xb5, 0x75, 0x35, 0xd2, 0x92, 0x52, 0x12, 0xcf, 0x8f, 0x4f, 0xf, 0x9c, 0xdc, 0x1c, 0x5c, 0x81, 0xc1, 0x1, 0x41, 0xa6, 0xe6, 0x26, 0x66, 0xbb, 0xfb, 0x3b, 0x7b, 0xcd, 0x8d, 0x4d, 0xd, 0xd0, 0x90, 0x50, 0x10, 0xf7, 0xb7, 0x77, 0x37, 0xea, 0xaa, 0x6a, 0x2a, 0xb9, 0xf9, 0x39, 0x79, 0xa4, 0xe4, 0x24, 0x64, 0x83, 0xc3, 0x3, 0x43, 0x9e, 0xde, 0x1e, 0x5e, 0x25, 0x65, 0xa5, 0xe5, 0x38, 0x78, 0xb8, 0xf8, 0x1f, 0x5f, 0x9f, 0xdf, 0x2, 0x42, 0x82, 0xc2, 0x51, 0x11, 0xd1, 0x91, 0x4c, 0xc, 0xcc, 0x8c, 0x6b, 0x2b, 0xeb, 0xab, 0x76, 0x36, 0xf6, 0xb6, 0x87, 0xc7, 0x7, 0x47, 0x9a, 0xda, 0x1a, 0x5a, 0xbd, 0xfd, 0x3d, 0x7d, 0xa0, 0xe0, 0x20, 0x60, 0xf3, 0xb3, 0x73, 0x33, 0xee, 0xae, 0x6e, 0x2e, 0xc9, 0x89, 0x49, 0x9, 0xd4, 0x94, 0x54, 0x14, 0x6f, 0x2f, 0xef, 0xaf, 0x72, 0x32, 0xf2, 0xb2, 0x55, 0x15, 0xd5, 0x95, 0x48, 0x8, 0xc8, 0x88, 0x1b, 0x5b, 0x9b, 0xdb, 0x6, 0x46, 0x86, 0xc6, 0x21, 0x61, 0xa1, 0xe1, 0x3c, 0x7c, 0xbc, 0xfc, 0x4a, 0xa, 0xca, 0x8a, 0x57, 0x17, 0xd7, 0x97, 0x70, 0x30, 0xf0, 0xb0, 0x6d, 0x2d, 0xed, 0xad, 0x3e, 0x7e, 0xbe, 0xfe, 0x23, 0x63, 0xa3, 0xe3, 0x4, 0x44, 0x84, 0xc4, 0x19, 0x59, 0x99, 0xd9, 0xa2, 0xe2, 0x22, 0x62, 0xbf, 0xff, 0x3f, 0x7f, 0x98, 0xd8, 0x18, 0x58, 0x85, 0xc5, 0x5, 0x45, 0xd6, 0x96, 0x56, 0x16, 0xcb, 0x8b, 0x4b, 0xb, 0xec, 0xac, 0x6c, 0x2c, 0xf1, 0xb1, 0x71, 0x31}, + {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8, 0x64, 0x25, 0xe6, 0xa7, 0x7d, 0x3c, 0xff, 0xbe, 0x56, 0x17, 0xd4, 0x95, 0x4f, 0xe, 0xcd, 0x8c, 0xc8, 0x89, 0x4a, 0xb, 0xd1, 0x90, 0x53, 0x12, 0xfa, 0xbb, 0x78, 0x39, 0xe3, 0xa2, 0x61, 0x20, 0xac, 0xed, 0x2e, 0x6f, 0xb5, 0xf4, 0x37, 0x76, 0x9e, 0xdf, 0x1c, 0x5d, 0x87, 0xc6, 0x5, 0x44, 0x8d, 0xcc, 0xf, 0x4e, 0x94, 0xd5, 0x16, 0x57, 0xbf, 0xfe, 0x3d, 0x7c, 0xa6, 0xe7, 0x24, 0x65, 0xe9, 0xa8, 0x6b, 0x2a, 0xf0, 0xb1, 0x72, 0x33, 0xdb, 0x9a, 0x59, 0x18, 0xc2, 0x83, 0x40, 0x1, 0x45, 0x4, 0xc7, 0x86, 0x5c, 0x1d, 0xde, 0x9f, 0x77, 0x36, 0xf5, 0xb4, 0x6e, 0x2f, 0xec, 0xad, 0x21, 0x60, 0xa3, 0xe2, 0x38, 0x79, 0xba, 0xfb, 0x13, 0x52, 0x91, 0xd0, 0xa, 0x4b, 0x88, 0xc9, 0x7, 0x46, 0x85, 0xc4, 0x1e, 0x5f, 0x9c, 0xdd, 0x35, 0x74, 0xb7, 0xf6, 0x2c, 0x6d, 0xae, 0xef, 0x63, 0x22, 0xe1, 0xa0, 0x7a, 0x3b, 0xf8, 0xb9, 0x51, 0x10, 0xd3, 0x92, 0x48, 0x9, 0xca, 0x8b, 0xcf, 
0x8e, 0x4d, 0xc, 0xd6, 0x97, 0x54, 0x15, 0xfd, 0xbc, 0x7f, 0x3e, 0xe4, 0xa5, 0x66, 0x27, 0xab, 0xea, 0x29, 0x68, 0xb2, 0xf3, 0x30, 0x71, 0x99, 0xd8, 0x1b, 0x5a, 0x80, 0xc1, 0x2, 0x43, 0x8a, 0xcb, 0x8, 0x49, 0x93, 0xd2, 0x11, 0x50, 0xb8, 0xf9, 0x3a, 0x7b, 0xa1, 0xe0, 0x23, 0x62, 0xee, 0xaf, 0x6c, 0x2d, 0xf7, 0xb6, 0x75, 0x34, 0xdc, 0x9d, 0x5e, 0x1f, 0xc5, 0x84, 0x47, 0x6, 0x42, 0x3, 0xc0, 0x81, 0x5b, 0x1a, 0xd9, 0x98, 0x70, 0x31, 0xf2, 0xb3, 0x69, 0x28, 0xeb, 0xaa, 0x26, 0x67, 0xa4, 0xe5, 0x3f, 0x7e, 0xbd, 0xfc, 0x14, 0x55, 0x96, 0xd7, 0xd, 0x4c, 0x8f, 0xce}, + {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9, 0x54, 0x16, 0xd0, 0x92, 0x41, 0x3, 0xc5, 0x87, 0x7e, 0x3c, 0xfa, 0xb8, 0x6b, 0x29, 0xef, 0xad, 0xa8, 0xea, 0x2c, 0x6e, 0xbd, 0xff, 0x39, 0x7b, 0x82, 0xc0, 0x6, 0x44, 0x97, 0xd5, 0x13, 0x51, 0xfc, 0xbe, 0x78, 0x3a, 0xe9, 0xab, 0x6d, 0x2f, 0xd6, 0x94, 0x52, 0x10, 0xc3, 0x81, 0x47, 0x5, 0x4d, 0xf, 0xc9, 0x8b, 0x58, 0x1a, 0xdc, 0x9e, 0x67, 0x25, 0xe3, 0xa1, 0x72, 0x30, 0xf6, 0xb4, 0x19, 0x5b, 0x9d, 0xdf, 0xc, 0x4e, 0x88, 0xca, 0x33, 0x71, 0xb7, 0xf5, 0x26, 0x64, 0xa2, 0xe0, 0xe5, 0xa7, 0x61, 0x23, 0xf0, 0xb2, 0x74, 0x36, 0xcf, 0x8d, 0x4b, 0x9, 0xda, 0x98, 0x5e, 0x1c, 0xb1, 0xf3, 0x35, 0x77, 0xa4, 0xe6, 0x20, 0x62, 0x9b, 0xd9, 0x1f, 0x5d, 0x8e, 0xcc, 0xa, 0x48, 0x9a, 0xd8, 0x1e, 0x5c, 0x8f, 0xcd, 0xb, 0x49, 0xb0, 0xf2, 0x34, 0x76, 0xa5, 0xe7, 0x21, 0x63, 0xce, 0x8c, 0x4a, 0x8, 0xdb, 0x99, 0x5f, 0x1d, 0xe4, 0xa6, 0x60, 0x22, 0xf1, 0xb3, 0x75, 0x37, 0x32, 0x70, 0xb6, 0xf4, 0x27, 0x65, 0xa3, 0xe1, 0x18, 0x5a, 0x9c, 0xde, 0xd, 0x4f, 0x89, 0xcb, 0x66, 0x24, 0xe2, 0xa0, 0x73, 0x31, 0xf7, 0xb5, 0x4c, 0xe, 0xc8, 0x8a, 0x59, 0x1b, 0xdd, 0x9f, 0xd7, 0x95, 0x53, 0x11, 0xc2, 0x80, 0x46, 0x4, 0xfd, 0xbf, 0x79, 0x3b, 0xe8, 0xaa, 0x6c, 0x2e, 0x83, 0xc1, 0x7, 0x45, 0x96, 0xd4, 0x12, 0x50, 0xa9, 0xeb, 0x2d, 0x6f, 0xbc, 0xfe, 0x38, 0x7a, 0x7f, 0x3d, 0xfb, 0xb9, 0x6a, 0x28, 0xee, 0xac, 0x55, 0x17, 0xd1, 0x93, 0x40, 0x2, 0xc4, 0x86, 0x2b, 0x69, 0xaf, 0xed, 0x3e, 0x7c, 0xba, 0xf8, 0x1, 0x43, 0x85, 0xc7, 0x14, 0x56, 0x90, 0xd2}, + {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6, 0x44, 0x7, 0xc2, 0x81, 0x55, 0x16, 0xd3, 0x90, 0x66, 0x25, 0xe0, 0xa3, 0x77, 0x34, 0xf1, 0xb2, 0x88, 0xcb, 0xe, 0x4d, 0x99, 0xda, 0x1f, 0x5c, 0xaa, 0xe9, 0x2c, 0x6f, 0xbb, 0xf8, 0x3d, 0x7e, 0xcc, 0x8f, 0x4a, 0x9, 0xdd, 0x9e, 0x5b, 0x18, 0xee, 0xad, 0x68, 0x2b, 0xff, 0xbc, 0x79, 0x3a, 0xd, 0x4e, 0x8b, 0xc8, 0x1c, 0x5f, 0x9a, 0xd9, 0x2f, 0x6c, 0xa9, 0xea, 0x3e, 0x7d, 0xb8, 0xfb, 0x49, 0xa, 0xcf, 0x8c, 0x58, 0x1b, 0xde, 0x9d, 0x6b, 0x28, 0xed, 0xae, 0x7a, 0x39, 0xfc, 0xbf, 0x85, 0xc6, 0x3, 0x40, 0x94, 0xd7, 0x12, 0x51, 0xa7, 0xe4, 0x21, 0x62, 0xb6, 0xf5, 0x30, 0x73, 0xc1, 0x82, 0x47, 0x4, 0xd0, 0x93, 0x56, 0x15, 0xe3, 0xa0, 0x65, 0x26, 0xf2, 0xb1, 0x74, 0x37, 0x1a, 0x59, 0x9c, 0xdf, 0xb, 0x48, 0x8d, 0xce, 0x38, 0x7b, 0xbe, 0xfd, 0x29, 0x6a, 0xaf, 0xec, 0x5e, 0x1d, 0xd8, 0x9b, 0x4f, 0xc, 0xc9, 0x8a, 0x7c, 0x3f, 0xfa, 0xb9, 0x6d, 0x2e, 0xeb, 0xa8, 0x92, 0xd1, 0x14, 0x57, 0x83, 0xc0, 0x5, 0x46, 0xb0, 0xf3, 0x36, 0x75, 0xa1, 0xe2, 0x27, 0x64, 0xd6, 0x95, 0x50, 0x13, 0xc7, 0x84, 0x41, 0x2, 0xf4, 0xb7, 0x72, 0x31, 0xe5, 0xa6, 0x63, 0x20, 0x17, 0x54, 0x91, 0xd2, 0x6, 0x45, 0x80, 0xc3, 0x35, 0x76, 0xb3, 0xf0, 0x24, 0x67, 0xa2, 0xe1, 0x53, 0x10, 0xd5, 0x96, 0x42, 0x1, 0xc4, 0x87, 0x71, 0x32, 0xf7, 0xb4, 0x60, 0x23, 0xe6, 0xa5, 0x9f, 0xdc, 0x19, 0x5a, 0x8e, 0xcd, 0x8, 0x4b, 0xbd, 0xfe, 0x3b, 0x78, 0xac, 0xef, 0x2a, 0x69, 0xdb, 0x98, 0x5d, 0x1e, 0xca, 0x89, 
0x4c, 0xf, 0xf9, 0xba, 0x7f, 0x3c, 0xe8, 0xab, 0x6e, 0x2d}, + {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb, 0x34, 0x70, 0xbc, 0xf8, 0x39, 0x7d, 0xb1, 0xf5, 0x2e, 0x6a, 0xa6, 0xe2, 0x23, 0x67, 0xab, 0xef, 0x68, 0x2c, 0xe0, 0xa4, 0x65, 0x21, 0xed, 0xa9, 0x72, 0x36, 0xfa, 0xbe, 0x7f, 0x3b, 0xf7, 0xb3, 0x5c, 0x18, 0xd4, 0x90, 0x51, 0x15, 0xd9, 0x9d, 0x46, 0x2, 0xce, 0x8a, 0x4b, 0xf, 0xc3, 0x87, 0xd0, 0x94, 0x58, 0x1c, 0xdd, 0x99, 0x55, 0x11, 0xca, 0x8e, 0x42, 0x6, 0xc7, 0x83, 0x4f, 0xb, 0xe4, 0xa0, 0x6c, 0x28, 0xe9, 0xad, 0x61, 0x25, 0xfe, 0xba, 0x76, 0x32, 0xf3, 0xb7, 0x7b, 0x3f, 0xb8, 0xfc, 0x30, 0x74, 0xb5, 0xf1, 0x3d, 0x79, 0xa2, 0xe6, 0x2a, 0x6e, 0xaf, 0xeb, 0x27, 0x63, 0x8c, 0xc8, 0x4, 0x40, 0x81, 0xc5, 0x9, 0x4d, 0x96, 0xd2, 0x1e, 0x5a, 0x9b, 0xdf, 0x13, 0x57, 0xbd, 0xf9, 0x35, 0x71, 0xb0, 0xf4, 0x38, 0x7c, 0xa7, 0xe3, 0x2f, 0x6b, 0xaa, 0xee, 0x22, 0x66, 0x89, 0xcd, 0x1, 0x45, 0x84, 0xc0, 0xc, 0x48, 0x93, 0xd7, 0x1b, 0x5f, 0x9e, 0xda, 0x16, 0x52, 0xd5, 0x91, 0x5d, 0x19, 0xd8, 0x9c, 0x50, 0x14, 0xcf, 0x8b, 0x47, 0x3, 0xc2, 0x86, 0x4a, 0xe, 0xe1, 0xa5, 0x69, 0x2d, 0xec, 0xa8, 0x64, 0x20, 0xfb, 0xbf, 0x73, 0x37, 0xf6, 0xb2, 0x7e, 0x3a, 0x6d, 0x29, 0xe5, 0xa1, 0x60, 0x24, 0xe8, 0xac, 0x77, 0x33, 0xff, 0xbb, 0x7a, 0x3e, 0xf2, 0xb6, 0x59, 0x1d, 0xd1, 0x95, 0x54, 0x10, 0xdc, 0x98, 0x43, 0x7, 0xcb, 0x8f, 0x4e, 0xa, 0xc6, 0x82, 0x5, 0x41, 0x8d, 0xc9, 0x8, 0x4c, 0x80, 0xc4, 0x1f, 0x5b, 0x97, 0xd3, 0x12, 0x56, 0x9a, 0xde, 0x31, 0x75, 0xb9, 0xfd, 0x3c, 0x78, 0xb4, 0xf0, 0x2b, 0x6f, 0xa3, 0xe7, 0x26, 0x62, 0xae, 0xea}, + {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4, 0x24, 0x61, 0xae, 0xeb, 0x2d, 0x68, 0xa7, 0xe2, 0x36, 0x73, 0xbc, 0xf9, 0x3f, 0x7a, 0xb5, 0xf0, 0x48, 0xd, 0xc2, 0x87, 0x41, 0x4, 0xcb, 0x8e, 0x5a, 0x1f, 0xd0, 0x95, 0x53, 0x16, 0xd9, 0x9c, 0x6c, 0x29, 0xe6, 0xa3, 0x65, 0x20, 0xef, 0xaa, 0x7e, 0x3b, 0xf4, 0xb1, 0x77, 0x32, 0xfd, 0xb8, 0x90, 0xd5, 0x1a, 0x5f, 0x99, 0xdc, 0x13, 0x56, 0x82, 0xc7, 0x8, 0x4d, 0x8b, 0xce, 0x1, 0x44, 0xb4, 0xf1, 0x3e, 0x7b, 0xbd, 0xf8, 0x37, 0x72, 0xa6, 0xe3, 0x2c, 0x69, 0xaf, 0xea, 0x25, 0x60, 0xd8, 0x9d, 0x52, 0x17, 0xd1, 0x94, 0x5b, 0x1e, 0xca, 0x8f, 0x40, 0x5, 0xc3, 0x86, 0x49, 0xc, 0xfc, 0xb9, 0x76, 0x33, 0xf5, 0xb0, 0x7f, 0x3a, 0xee, 0xab, 0x64, 0x21, 0xe7, 0xa2, 0x6d, 0x28, 0x3d, 0x78, 0xb7, 0xf2, 0x34, 0x71, 0xbe, 0xfb, 0x2f, 0x6a, 0xa5, 0xe0, 0x26, 0x63, 0xac, 0xe9, 0x19, 0x5c, 0x93, 0xd6, 0x10, 0x55, 0x9a, 0xdf, 0xb, 0x4e, 0x81, 0xc4, 0x2, 0x47, 0x88, 0xcd, 0x75, 0x30, 0xff, 0xba, 0x7c, 0x39, 0xf6, 0xb3, 0x67, 0x22, 0xed, 0xa8, 0x6e, 0x2b, 0xe4, 0xa1, 0x51, 0x14, 0xdb, 0x9e, 0x58, 0x1d, 0xd2, 0x97, 0x43, 0x6, 0xc9, 0x8c, 0x4a, 0xf, 0xc0, 0x85, 0xad, 0xe8, 0x27, 0x62, 0xa4, 0xe1, 0x2e, 0x6b, 0xbf, 0xfa, 0x35, 0x70, 0xb6, 0xf3, 0x3c, 0x79, 0x89, 0xcc, 0x3, 0x46, 0x80, 0xc5, 0xa, 0x4f, 0x9b, 0xde, 0x11, 0x54, 0x92, 0xd7, 0x18, 0x5d, 0xe5, 0xa0, 0x6f, 0x2a, 0xec, 0xa9, 0x66, 0x23, 0xf7, 0xb2, 0x7d, 0x38, 0xfe, 0xbb, 0x74, 0x31, 0xc1, 0x84, 0x4b, 0xe, 0xc8, 0x8d, 0x42, 0x7, 0xd3, 0x96, 0x59, 0x1c, 0xda, 0x9f, 0x50, 0x15}, + {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5, 0x14, 0x52, 0x98, 0xde, 0x11, 0x57, 0x9d, 0xdb, 0x1e, 0x58, 0x92, 0xd4, 0x1b, 0x5d, 0x97, 0xd1, 0x28, 0x6e, 0xa4, 0xe2, 0x2d, 0x6b, 0xa1, 0xe7, 0x22, 0x64, 0xae, 0xe8, 0x27, 0x61, 0xab, 0xed, 0x3c, 0x7a, 0xb0, 0xf6, 0x39, 0x7f, 0xb5, 0xf3, 0x36, 0x70, 0xba, 0xfc, 0x33, 0x75, 0xbf, 0xf9, 0x50, 0x16, 0xdc, 0x9a, 0x55, 0x13, 0xd9, 0x9f, 0x5a, 0x1c, 
0xd6, 0x90, 0x5f, 0x19, 0xd3, 0x95, 0x44, 0x2, 0xc8, 0x8e, 0x41, 0x7, 0xcd, 0x8b, 0x4e, 0x8, 0xc2, 0x84, 0x4b, 0xd, 0xc7, 0x81, 0x78, 0x3e, 0xf4, 0xb2, 0x7d, 0x3b, 0xf1, 0xb7, 0x72, 0x34, 0xfe, 0xb8, 0x77, 0x31, 0xfb, 0xbd, 0x6c, 0x2a, 0xe0, 0xa6, 0x69, 0x2f, 0xe5, 0xa3, 0x66, 0x20, 0xea, 0xac, 0x63, 0x25, 0xef, 0xa9, 0xa0, 0xe6, 0x2c, 0x6a, 0xa5, 0xe3, 0x29, 0x6f, 0xaa, 0xec, 0x26, 0x60, 0xaf, 0xe9, 0x23, 0x65, 0xb4, 0xf2, 0x38, 0x7e, 0xb1, 0xf7, 0x3d, 0x7b, 0xbe, 0xf8, 0x32, 0x74, 0xbb, 0xfd, 0x37, 0x71, 0x88, 0xce, 0x4, 0x42, 0x8d, 0xcb, 0x1, 0x47, 0x82, 0xc4, 0xe, 0x48, 0x87, 0xc1, 0xb, 0x4d, 0x9c, 0xda, 0x10, 0x56, 0x99, 0xdf, 0x15, 0x53, 0x96, 0xd0, 0x1a, 0x5c, 0x93, 0xd5, 0x1f, 0x59, 0xf0, 0xb6, 0x7c, 0x3a, 0xf5, 0xb3, 0x79, 0x3f, 0xfa, 0xbc, 0x76, 0x30, 0xff, 0xb9, 0x73, 0x35, 0xe4, 0xa2, 0x68, 0x2e, 0xe1, 0xa7, 0x6d, 0x2b, 0xee, 0xa8, 0x62, 0x24, 0xeb, 0xad, 0x67, 0x21, 0xd8, 0x9e, 0x54, 0x12, 0xdd, 0x9b, 0x51, 0x17, 0xd2, 0x94, 0x5e, 0x18, 0xd7, 0x91, 0x5b, 0x1d, 0xcc, 0x8a, 0x40, 0x6, 0xc9, 0x8f, 0x45, 0x3, 0xc6, 0x80, 0x4a, 0xc, 0xc3, 0x85, 0x4f, 0x9}, + {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca, 0x4, 0x43, 0x8a, 0xcd, 0x5, 0x42, 0x8b, 0xcc, 0x6, 0x41, 0x88, 0xcf, 0x7, 0x40, 0x89, 0xce, 0x8, 0x4f, 0x86, 0xc1, 0x9, 0x4e, 0x87, 0xc0, 0xa, 0x4d, 0x84, 0xc3, 0xb, 0x4c, 0x85, 0xc2, 0xc, 0x4b, 0x82, 0xc5, 0xd, 0x4a, 0x83, 0xc4, 0xe, 0x49, 0x80, 0xc7, 0xf, 0x48, 0x81, 0xc6, 0x10, 0x57, 0x9e, 0xd9, 0x11, 0x56, 0x9f, 0xd8, 0x12, 0x55, 0x9c, 0xdb, 0x13, 0x54, 0x9d, 0xda, 0x14, 0x53, 0x9a, 0xdd, 0x15, 0x52, 0x9b, 0xdc, 0x16, 0x51, 0x98, 0xdf, 0x17, 0x50, 0x99, 0xde, 0x18, 0x5f, 0x96, 0xd1, 0x19, 0x5e, 0x97, 0xd0, 0x1a, 0x5d, 0x94, 0xd3, 0x1b, 0x5c, 0x95, 0xd2, 0x1c, 0x5b, 0x92, 0xd5, 0x1d, 0x5a, 0x93, 0xd4, 0x1e, 0x59, 0x90, 0xd7, 0x1f, 0x58, 0x91, 0xd6, 0x20, 0x67, 0xae, 0xe9, 0x21, 0x66, 0xaf, 0xe8, 0x22, 0x65, 0xac, 0xeb, 0x23, 0x64, 0xad, 0xea, 0x24, 0x63, 0xaa, 0xed, 0x25, 0x62, 0xab, 0xec, 0x26, 0x61, 0xa8, 0xef, 0x27, 0x60, 0xa9, 0xee, 0x28, 0x6f, 0xa6, 0xe1, 0x29, 0x6e, 0xa7, 0xe0, 0x2a, 0x6d, 0xa4, 0xe3, 0x2b, 0x6c, 0xa5, 0xe2, 0x2c, 0x6b, 0xa2, 0xe5, 0x2d, 0x6a, 0xa3, 0xe4, 0x2e, 0x69, 0xa0, 0xe7, 0x2f, 0x68, 0xa1, 0xe6, 0x30, 0x77, 0xbe, 0xf9, 0x31, 0x76, 0xbf, 0xf8, 0x32, 0x75, 0xbc, 0xfb, 0x33, 0x74, 0xbd, 0xfa, 0x34, 0x73, 0xba, 0xfd, 0x35, 0x72, 0xbb, 0xfc, 0x36, 0x71, 0xb8, 0xff, 0x37, 0x70, 0xb9, 0xfe, 0x38, 0x7f, 0xb6, 0xf1, 0x39, 0x7e, 0xb7, 0xf0, 0x3a, 0x7d, 0xb4, 0xf3, 0x3b, 0x7c, 0xb5, 0xf2, 0x3c, 0x7b, 0xb2, 0xf5, 0x3d, 0x7a, 0xb3, 0xf4, 0x3e, 0x79, 0xb0, 0xf7, 0x3f, 0x78, 0xb1, 0xf6}, + {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f, 0xf4, 0xbc, 0x64, 0x2c, 0xc9, 0x81, 0x59, 0x11, 0x8e, 0xc6, 0x1e, 0x56, 0xb3, 0xfb, 0x23, 0x6b, 0xf5, 0xbd, 0x65, 0x2d, 0xc8, 0x80, 0x58, 0x10, 0x8f, 0xc7, 0x1f, 0x57, 0xb2, 0xfa, 0x22, 0x6a, 0x1, 0x49, 0x91, 0xd9, 0x3c, 0x74, 0xac, 0xe4, 0x7b, 0x33, 0xeb, 0xa3, 0x46, 0xe, 0xd6, 0x9e, 0xf7, 0xbf, 0x67, 0x2f, 0xca, 0x82, 0x5a, 0x12, 0x8d, 0xc5, 0x1d, 0x55, 0xb0, 0xf8, 0x20, 0x68, 0x3, 0x4b, 0x93, 0xdb, 0x3e, 0x76, 0xae, 0xe6, 0x79, 0x31, 0xe9, 0xa1, 0x44, 0xc, 0xd4, 0x9c, 0x2, 0x4a, 0x92, 0xda, 0x3f, 0x77, 0xaf, 0xe7, 0x78, 0x30, 0xe8, 0xa0, 0x45, 0xd, 0xd5, 0x9d, 0xf6, 0xbe, 0x66, 0x2e, 0xcb, 0x83, 0x5b, 0x13, 0x8c, 0xc4, 0x1c, 0x54, 0xb1, 0xf9, 0x21, 0x69, 0xf3, 0xbb, 0x63, 0x2b, 0xce, 0x86, 0x5e, 0x16, 0x89, 0xc1, 0x19, 0x51, 0xb4, 0xfc, 0x24, 0x6c, 0x7, 0x4f, 0x97, 0xdf, 0x3a, 0x72, 0xaa, 0xe2, 0x7d, 0x35, 0xed, 0xa5, 0x40, 0x8, 0xd0, 
0x98, 0x6, 0x4e, 0x96, 0xde, 0x3b, 0x73, 0xab, 0xe3, 0x7c, 0x34, 0xec, 0xa4, 0x41, 0x9, 0xd1, 0x99, 0xf2, 0xba, 0x62, 0x2a, 0xcf, 0x87, 0x5f, 0x17, 0x88, 0xc0, 0x18, 0x50, 0xb5, 0xfd, 0x25, 0x6d, 0x4, 0x4c, 0x94, 0xdc, 0x39, 0x71, 0xa9, 0xe1, 0x7e, 0x36, 0xee, 0xa6, 0x43, 0xb, 0xd3, 0x9b, 0xf0, 0xb8, 0x60, 0x28, 0xcd, 0x85, 0x5d, 0x15, 0x8a, 0xc2, 0x1a, 0x52, 0xb7, 0xff, 0x27, 0x6f, 0xf1, 0xb9, 0x61, 0x29, 0xcc, 0x84, 0x5c, 0x14, 0x8b, 0xc3, 0x1b, 0x53, 0xb6, 0xfe, 0x26, 0x6e, 0x5, 0x4d, 0x95, 0xdd, 0x38, 0x70, 0xa8, 0xe0, 0x7f, 0x37, 0xef, 0xa7, 0x42, 0xa, 0xd2, 0x9a}, + {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90, 0xe4, 0xad, 0x76, 0x3f, 0xdd, 0x94, 0x4f, 0x6, 0x96, 0xdf, 0x4, 0x4d, 0xaf, 0xe6, 0x3d, 0x74, 0xd5, 0x9c, 0x47, 0xe, 0xec, 0xa5, 0x7e, 0x37, 0xa7, 0xee, 0x35, 0x7c, 0x9e, 0xd7, 0xc, 0x45, 0x31, 0x78, 0xa3, 0xea, 0x8, 0x41, 0x9a, 0xd3, 0x43, 0xa, 0xd1, 0x98, 0x7a, 0x33, 0xe8, 0xa1, 0xb7, 0xfe, 0x25, 0x6c, 0x8e, 0xc7, 0x1c, 0x55, 0xc5, 0x8c, 0x57, 0x1e, 0xfc, 0xb5, 0x6e, 0x27, 0x53, 0x1a, 0xc1, 0x88, 0x6a, 0x23, 0xf8, 0xb1, 0x21, 0x68, 0xb3, 0xfa, 0x18, 0x51, 0x8a, 0xc3, 0x62, 0x2b, 0xf0, 0xb9, 0x5b, 0x12, 0xc9, 0x80, 0x10, 0x59, 0x82, 0xcb, 0x29, 0x60, 0xbb, 0xf2, 0x86, 0xcf, 0x14, 0x5d, 0xbf, 0xf6, 0x2d, 0x64, 0xf4, 0xbd, 0x66, 0x2f, 0xcd, 0x84, 0x5f, 0x16, 0x73, 0x3a, 0xe1, 0xa8, 0x4a, 0x3, 0xd8, 0x91, 0x1, 0x48, 0x93, 0xda, 0x38, 0x71, 0xaa, 0xe3, 0x97, 0xde, 0x5, 0x4c, 0xae, 0xe7, 0x3c, 0x75, 0xe5, 0xac, 0x77, 0x3e, 0xdc, 0x95, 0x4e, 0x7, 0xa6, 0xef, 0x34, 0x7d, 0x9f, 0xd6, 0xd, 0x44, 0xd4, 0x9d, 0x46, 0xf, 0xed, 0xa4, 0x7f, 0x36, 0x42, 0xb, 0xd0, 0x99, 0x7b, 0x32, 0xe9, 0xa0, 0x30, 0x79, 0xa2, 0xeb, 0x9, 0x40, 0x9b, 0xd2, 0xc4, 0x8d, 0x56, 0x1f, 0xfd, 0xb4, 0x6f, 0x26, 0xb6, 0xff, 0x24, 0x6d, 0x8f, 0xc6, 0x1d, 0x54, 0x20, 0x69, 0xb2, 0xfb, 0x19, 0x50, 0x8b, 0xc2, 0x52, 0x1b, 0xc0, 0x89, 0x6b, 0x22, 0xf9, 0xb0, 0x11, 0x58, 0x83, 0xca, 0x28, 0x61, 0xba, 0xf3, 0x63, 0x2a, 0xf1, 0xb8, 0x5a, 0x13, 0xc8, 0x81, 0xf5, 0xbc, 0x67, 0x2e, 0xcc, 0x85, 0x5e, 0x17, 0x87, 0xce, 0x15, 0x5c, 0xbe, 0xf7, 0x2c, 0x65}, + {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81, 0xd4, 0x9e, 0x40, 0xa, 0xe1, 0xab, 0x75, 0x3f, 0xbe, 0xf4, 0x2a, 0x60, 0x8b, 0xc1, 0x1f, 0x55, 0xb5, 0xff, 0x21, 0x6b, 0x80, 0xca, 0x14, 0x5e, 0xdf, 0x95, 0x4b, 0x1, 0xea, 0xa0, 0x7e, 0x34, 0x61, 0x2b, 0xf5, 0xbf, 0x54, 0x1e, 0xc0, 0x8a, 0xb, 0x41, 0x9f, 0xd5, 0x3e, 0x74, 0xaa, 0xe0, 0x77, 0x3d, 0xe3, 0xa9, 0x42, 0x8, 0xd6, 0x9c, 0x1d, 0x57, 0x89, 0xc3, 0x28, 0x62, 0xbc, 0xf6, 0xa3, 0xe9, 0x37, 0x7d, 0x96, 0xdc, 0x2, 0x48, 0xc9, 0x83, 0x5d, 0x17, 0xfc, 0xb6, 0x68, 0x22, 0xc2, 0x88, 0x56, 0x1c, 0xf7, 0xbd, 0x63, 0x29, 0xa8, 0xe2, 0x3c, 0x76, 0x9d, 0xd7, 0x9, 0x43, 0x16, 0x5c, 0x82, 0xc8, 0x23, 0x69, 0xb7, 0xfd, 0x7c, 0x36, 0xe8, 0xa2, 0x49, 0x3, 0xdd, 0x97, 0xee, 0xa4, 0x7a, 0x30, 0xdb, 0x91, 0x4f, 0x5, 0x84, 0xce, 0x10, 0x5a, 0xb1, 0xfb, 0x25, 0x6f, 0x3a, 0x70, 0xae, 0xe4, 0xf, 0x45, 0x9b, 0xd1, 0x50, 0x1a, 0xc4, 0x8e, 0x65, 0x2f, 0xf1, 0xbb, 0x5b, 0x11, 0xcf, 0x85, 0x6e, 0x24, 0xfa, 0xb0, 0x31, 0x7b, 0xa5, 0xef, 0x4, 0x4e, 0x90, 0xda, 0x8f, 0xc5, 0x1b, 0x51, 0xba, 0xf0, 0x2e, 0x64, 0xe5, 0xaf, 0x71, 0x3b, 0xd0, 0x9a, 0x44, 0xe, 0x99, 0xd3, 0xd, 0x47, 0xac, 0xe6, 0x38, 0x72, 0xf3, 0xb9, 0x67, 0x2d, 0xc6, 0x8c, 0x52, 0x18, 0x4d, 0x7, 0xd9, 0x93, 0x78, 0x32, 0xec, 0xa6, 0x27, 0x6d, 0xb3, 0xf9, 0x12, 0x58, 0x86, 0xcc, 0x2c, 0x66, 0xb8, 0xf2, 0x19, 0x53, 0x8d, 0xc7, 0x46, 0xc, 0xd2, 0x98, 0x73, 0x39, 0xe7, 0xad, 0xf8, 0xb2, 0x6c, 0x26, 
0xcd, 0x87, 0x59, 0x13, 0x92, 0xd8, 0x6, 0x4c, 0xa7, 0xed, 0x33, 0x79}, + {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e, 0xc4, 0x8f, 0x52, 0x19, 0xf5, 0xbe, 0x63, 0x28, 0xa6, 0xed, 0x30, 0x7b, 0x97, 0xdc, 0x1, 0x4a, 0x95, 0xde, 0x3, 0x48, 0xa4, 0xef, 0x32, 0x79, 0xf7, 0xbc, 0x61, 0x2a, 0xc6, 0x8d, 0x50, 0x1b, 0x51, 0x1a, 0xc7, 0x8c, 0x60, 0x2b, 0xf6, 0xbd, 0x33, 0x78, 0xa5, 0xee, 0x2, 0x49, 0x94, 0xdf, 0x37, 0x7c, 0xa1, 0xea, 0x6, 0x4d, 0x90, 0xdb, 0x55, 0x1e, 0xc3, 0x88, 0x64, 0x2f, 0xf2, 0xb9, 0xf3, 0xb8, 0x65, 0x2e, 0xc2, 0x89, 0x54, 0x1f, 0x91, 0xda, 0x7, 0x4c, 0xa0, 0xeb, 0x36, 0x7d, 0xa2, 0xe9, 0x34, 0x7f, 0x93, 0xd8, 0x5, 0x4e, 0xc0, 0x8b, 0x56, 0x1d, 0xf1, 0xba, 0x67, 0x2c, 0x66, 0x2d, 0xf0, 0xbb, 0x57, 0x1c, 0xc1, 0x8a, 0x4, 0x4f, 0x92, 0xd9, 0x35, 0x7e, 0xa3, 0xe8, 0x6e, 0x25, 0xf8, 0xb3, 0x5f, 0x14, 0xc9, 0x82, 0xc, 0x47, 0x9a, 0xd1, 0x3d, 0x76, 0xab, 0xe0, 0xaa, 0xe1, 0x3c, 0x77, 0x9b, 0xd0, 0xd, 0x46, 0xc8, 0x83, 0x5e, 0x15, 0xf9, 0xb2, 0x6f, 0x24, 0xfb, 0xb0, 0x6d, 0x26, 0xca, 0x81, 0x5c, 0x17, 0x99, 0xd2, 0xf, 0x44, 0xa8, 0xe3, 0x3e, 0x75, 0x3f, 0x74, 0xa9, 0xe2, 0xe, 0x45, 0x98, 0xd3, 0x5d, 0x16, 0xcb, 0x80, 0x6c, 0x27, 0xfa, 0xb1, 0x59, 0x12, 0xcf, 0x84, 0x68, 0x23, 0xfe, 0xb5, 0x3b, 0x70, 0xad, 0xe6, 0xa, 0x41, 0x9c, 0xd7, 0x9d, 0xd6, 0xb, 0x40, 0xac, 0xe7, 0x3a, 0x71, 0xff, 0xb4, 0x69, 0x22, 0xce, 0x85, 0x58, 0x13, 0xcc, 0x87, 0x5a, 0x11, 0xfd, 0xb6, 0x6b, 0x20, 0xae, 0xe5, 0x38, 0x73, 0x9f, 0xd4, 0x9, 0x42, 0x8, 0x43, 0x9e, 0xd5, 0x39, 0x72, 0xaf, 0xe4, 0x6a, 0x21, 0xfc, 0xb7, 0x5b, 0x10, 0xcd, 0x86}, + {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3, 0xb4, 0xf8, 0x2c, 0x60, 0x99, 0xd5, 0x1, 0x4d, 0xee, 0xa2, 0x76, 0x3a, 0xc3, 0x8f, 0x5b, 0x17, 0x75, 0x39, 0xed, 0xa1, 0x58, 0x14, 0xc0, 0x8c, 0x2f, 0x63, 0xb7, 0xfb, 0x2, 0x4e, 0x9a, 0xd6, 0xc1, 0x8d, 0x59, 0x15, 0xec, 0xa0, 0x74, 0x38, 0x9b, 0xd7, 0x3, 0x4f, 0xb6, 0xfa, 0x2e, 0x62, 0xea, 0xa6, 0x72, 0x3e, 0xc7, 0x8b, 0x5f, 0x13, 0xb0, 0xfc, 0x28, 0x64, 0x9d, 0xd1, 0x5, 0x49, 0x5e, 0x12, 0xc6, 0x8a, 0x73, 0x3f, 0xeb, 0xa7, 0x4, 0x48, 0x9c, 0xd0, 0x29, 0x65, 0xb1, 0xfd, 0x9f, 0xd3, 0x7, 0x4b, 0xb2, 0xfe, 0x2a, 0x66, 0xc5, 0x89, 0x5d, 0x11, 0xe8, 0xa4, 0x70, 0x3c, 0x2b, 0x67, 0xb3, 0xff, 0x6, 0x4a, 0x9e, 0xd2, 0x71, 0x3d, 0xe9, 0xa5, 0x5c, 0x10, 0xc4, 0x88, 0xc9, 0x85, 0x51, 0x1d, 0xe4, 0xa8, 0x7c, 0x30, 0x93, 0xdf, 0xb, 0x47, 0xbe, 0xf2, 0x26, 0x6a, 0x7d, 0x31, 0xe5, 0xa9, 0x50, 0x1c, 0xc8, 0x84, 0x27, 0x6b, 0xbf, 0xf3, 0xa, 0x46, 0x92, 0xde, 0xbc, 0xf0, 0x24, 0x68, 0x91, 0xdd, 0x9, 0x45, 0xe6, 0xaa, 0x7e, 0x32, 0xcb, 0x87, 0x53, 0x1f, 0x8, 0x44, 0x90, 0xdc, 0x25, 0x69, 0xbd, 0xf1, 0x52, 0x1e, 0xca, 0x86, 0x7f, 0x33, 0xe7, 0xab, 0x23, 0x6f, 0xbb, 0xf7, 0xe, 0x42, 0x96, 0xda, 0x79, 0x35, 0xe1, 0xad, 0x54, 0x18, 0xcc, 0x80, 0x97, 0xdb, 0xf, 0x43, 0xba, 0xf6, 0x22, 0x6e, 0xcd, 0x81, 0x55, 0x19, 0xe0, 0xac, 0x78, 0x34, 0x56, 0x1a, 0xce, 0x82, 0x7b, 0x37, 0xe3, 0xaf, 0xc, 0x40, 0x94, 0xd8, 0x21, 0x6d, 0xb9, 0xf5, 0xe2, 0xae, 0x7a, 0x36, 0xcf, 0x83, 0x57, 0x1b, 0xb8, 0xf4, 0x20, 0x6c, 0x95, 0xd9, 0xd, 0x41}, + {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac, 0xa4, 0xe9, 0x3e, 0x73, 0x8d, 0xc0, 0x17, 0x5a, 0xf6, 0xbb, 0x6c, 0x21, 0xdf, 0x92, 0x45, 0x8, 0x55, 0x18, 0xcf, 0x82, 0x7c, 0x31, 0xe6, 0xab, 0x7, 0x4a, 0x9d, 0xd0, 0x2e, 0x63, 0xb4, 0xf9, 0xf1, 0xbc, 0x6b, 0x26, 0xd8, 0x95, 0x42, 0xf, 0xa3, 0xee, 0x39, 0x74, 0x8a, 0xc7, 0x10, 0x5d, 0xaa, 0xe7, 0x30, 0x7d, 0x83, 0xce, 0x19, 0x54, 
0xf8, 0xb5, 0x62, 0x2f, 0xd1, 0x9c, 0x4b, 0x6, 0xe, 0x43, 0x94, 0xd9, 0x27, 0x6a, 0xbd, 0xf0, 0x5c, 0x11, 0xc6, 0x8b, 0x75, 0x38, 0xef, 0xa2, 0xff, 0xb2, 0x65, 0x28, 0xd6, 0x9b, 0x4c, 0x1, 0xad, 0xe0, 0x37, 0x7a, 0x84, 0xc9, 0x1e, 0x53, 0x5b, 0x16, 0xc1, 0x8c, 0x72, 0x3f, 0xe8, 0xa5, 0x9, 0x44, 0x93, 0xde, 0x20, 0x6d, 0xba, 0xf7, 0x49, 0x4, 0xd3, 0x9e, 0x60, 0x2d, 0xfa, 0xb7, 0x1b, 0x56, 0x81, 0xcc, 0x32, 0x7f, 0xa8, 0xe5, 0xed, 0xa0, 0x77, 0x3a, 0xc4, 0x89, 0x5e, 0x13, 0xbf, 0xf2, 0x25, 0x68, 0x96, 0xdb, 0xc, 0x41, 0x1c, 0x51, 0x86, 0xcb, 0x35, 0x78, 0xaf, 0xe2, 0x4e, 0x3, 0xd4, 0x99, 0x67, 0x2a, 0xfd, 0xb0, 0xb8, 0xf5, 0x22, 0x6f, 0x91, 0xdc, 0xb, 0x46, 0xea, 0xa7, 0x70, 0x3d, 0xc3, 0x8e, 0x59, 0x14, 0xe3, 0xae, 0x79, 0x34, 0xca, 0x87, 0x50, 0x1d, 0xb1, 0xfc, 0x2b, 0x66, 0x98, 0xd5, 0x2, 0x4f, 0x47, 0xa, 0xdd, 0x90, 0x6e, 0x23, 0xf4, 0xb9, 0x15, 0x58, 0x8f, 0xc2, 0x3c, 0x71, 0xa6, 0xeb, 0xb6, 0xfb, 0x2c, 0x61, 0x9f, 0xd2, 0x5, 0x48, 0xe4, 0xa9, 0x7e, 0x33, 0xcd, 0x80, 0x57, 0x1a, 0x12, 0x5f, 0x88, 0xc5, 0x3b, 0x76, 0xa1, 0xec, 0x40, 0xd, 0xda, 0x97, 0x69, 0x24, 0xf3, 0xbe}, + {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd, 0x94, 0xda, 0x8, 0x46, 0xb1, 0xff, 0x2d, 0x63, 0xde, 0x90, 0x42, 0xc, 0xfb, 0xb5, 0x67, 0x29, 0x35, 0x7b, 0xa9, 0xe7, 0x10, 0x5e, 0x8c, 0xc2, 0x7f, 0x31, 0xe3, 0xad, 0x5a, 0x14, 0xc6, 0x88, 0xa1, 0xef, 0x3d, 0x73, 0x84, 0xca, 0x18, 0x56, 0xeb, 0xa5, 0x77, 0x39, 0xce, 0x80, 0x52, 0x1c, 0x6a, 0x24, 0xf6, 0xb8, 0x4f, 0x1, 0xd3, 0x9d, 0x20, 0x6e, 0xbc, 0xf2, 0x5, 0x4b, 0x99, 0xd7, 0xfe, 0xb0, 0x62, 0x2c, 0xdb, 0x95, 0x47, 0x9, 0xb4, 0xfa, 0x28, 0x66, 0x91, 0xdf, 0xd, 0x43, 0x5f, 0x11, 0xc3, 0x8d, 0x7a, 0x34, 0xe6, 0xa8, 0x15, 0x5b, 0x89, 0xc7, 0x30, 0x7e, 0xac, 0xe2, 0xcb, 0x85, 0x57, 0x19, 0xee, 0xa0, 0x72, 0x3c, 0x81, 0xcf, 0x1d, 0x53, 0xa4, 0xea, 0x38, 0x76, 0xd4, 0x9a, 0x48, 0x6, 0xf1, 0xbf, 0x6d, 0x23, 0x9e, 0xd0, 0x2, 0x4c, 0xbb, 0xf5, 0x27, 0x69, 0x40, 0xe, 0xdc, 0x92, 0x65, 0x2b, 0xf9, 0xb7, 0xa, 0x44, 0x96, 0xd8, 0x2f, 0x61, 0xb3, 0xfd, 0xe1, 0xaf, 0x7d, 0x33, 0xc4, 0x8a, 0x58, 0x16, 0xab, 0xe5, 0x37, 0x79, 0x8e, 0xc0, 0x12, 0x5c, 0x75, 0x3b, 0xe9, 0xa7, 0x50, 0x1e, 0xcc, 0x82, 0x3f, 0x71, 0xa3, 0xed, 0x1a, 0x54, 0x86, 0xc8, 0xbe, 0xf0, 0x22, 0x6c, 0x9b, 0xd5, 0x7, 0x49, 0xf4, 0xba, 0x68, 0x26, 0xd1, 0x9f, 0x4d, 0x3, 0x2a, 0x64, 0xb6, 0xf8, 0xf, 0x41, 0x93, 0xdd, 0x60, 0x2e, 0xfc, 0xb2, 0x45, 0xb, 0xd9, 0x97, 0x8b, 0xc5, 0x17, 0x59, 0xae, 0xe0, 0x32, 0x7c, 0xc1, 0x8f, 0x5d, 0x13, 0xe4, 0xaa, 0x78, 0x36, 0x1f, 0x51, 0x83, 0xcd, 0x3a, 0x74, 0xa6, 0xe8, 0x55, 0x1b, 0xc9, 0x87, 0x70, 0x3e, 0xec, 0xa2}, + {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2, 0x84, 0xcb, 0x1a, 0x55, 0xa5, 0xea, 0x3b, 0x74, 0xc6, 0x89, 0x58, 0x17, 0xe7, 0xa8, 0x79, 0x36, 0x15, 0x5a, 0x8b, 0xc4, 0x34, 0x7b, 0xaa, 0xe5, 0x57, 0x18, 0xc9, 0x86, 0x76, 0x39, 0xe8, 0xa7, 0x91, 0xde, 0xf, 0x40, 0xb0, 0xff, 0x2e, 0x61, 0xd3, 0x9c, 0x4d, 0x2, 0xf2, 0xbd, 0x6c, 0x23, 0x2a, 0x65, 0xb4, 0xfb, 0xb, 0x44, 0x95, 0xda, 0x68, 0x27, 0xf6, 0xb9, 0x49, 0x6, 0xd7, 0x98, 0xae, 0xe1, 0x30, 0x7f, 0x8f, 0xc0, 0x11, 0x5e, 0xec, 0xa3, 0x72, 0x3d, 0xcd, 0x82, 0x53, 0x1c, 0x3f, 0x70, 0xa1, 0xee, 0x1e, 0x51, 0x80, 0xcf, 0x7d, 0x32, 0xe3, 0xac, 0x5c, 0x13, 0xc2, 0x8d, 0xbb, 0xf4, 0x25, 0x6a, 0x9a, 0xd5, 0x4, 0x4b, 0xf9, 0xb6, 0x67, 0x28, 0xd8, 0x97, 0x46, 0x9, 0x54, 0x1b, 0xca, 0x85, 0x75, 0x3a, 0xeb, 0xa4, 0x16, 0x59, 0x88, 0xc7, 0x37, 0x78, 0xa9, 0xe6, 0xd0, 0x9f, 0x4e, 0x1, 0xf1, 0xbe, 0x6f, 0x20, 0x92, 0xdd, 0xc, 0x43, 0xb3, 
0xfc, 0x2d, 0x62, 0x41, 0xe, 0xdf, 0x90, 0x60, 0x2f, 0xfe, 0xb1, 0x3, 0x4c, 0x9d, 0xd2, 0x22, 0x6d, 0xbc, 0xf3, 0xc5, 0x8a, 0x5b, 0x14, 0xe4, 0xab, 0x7a, 0x35, 0x87, 0xc8, 0x19, 0x56, 0xa6, 0xe9, 0x38, 0x77, 0x7e, 0x31, 0xe0, 0xaf, 0x5f, 0x10, 0xc1, 0x8e, 0x3c, 0x73, 0xa2, 0xed, 0x1d, 0x52, 0x83, 0xcc, 0xfa, 0xb5, 0x64, 0x2b, 0xdb, 0x94, 0x45, 0xa, 0xb8, 0xf7, 0x26, 0x69, 0x99, 0xd6, 0x7, 0x48, 0x6b, 0x24, 0xf5, 0xba, 0x4a, 0x5, 0xd4, 0x9b, 0x29, 0x66, 0xb7, 0xf8, 0x8, 0x47, 0x96, 0xd9, 0xef, 0xa0, 0x71, 0x3e, 0xce, 0x81, 0x50, 0x1f, 0xad, 0xe2, 0x33, 0x7c, 0x8c, 0xc3, 0x12, 0x5d}, + {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17, 0x69, 0x39, 0xc9, 0x99, 0x34, 0x64, 0x94, 0xc4, 0xd3, 0x83, 0x73, 0x23, 0x8e, 0xde, 0x2e, 0x7e, 0xd2, 0x82, 0x72, 0x22, 0x8f, 0xdf, 0x2f, 0x7f, 0x68, 0x38, 0xc8, 0x98, 0x35, 0x65, 0x95, 0xc5, 0xbb, 0xeb, 0x1b, 0x4b, 0xe6, 0xb6, 0x46, 0x16, 0x1, 0x51, 0xa1, 0xf1, 0x5c, 0xc, 0xfc, 0xac, 0xb9, 0xe9, 0x19, 0x49, 0xe4, 0xb4, 0x44, 0x14, 0x3, 0x53, 0xa3, 0xf3, 0x5e, 0xe, 0xfe, 0xae, 0xd0, 0x80, 0x70, 0x20, 0x8d, 0xdd, 0x2d, 0x7d, 0x6a, 0x3a, 0xca, 0x9a, 0x37, 0x67, 0x97, 0xc7, 0x6b, 0x3b, 0xcb, 0x9b, 0x36, 0x66, 0x96, 0xc6, 0xd1, 0x81, 0x71, 0x21, 0x8c, 0xdc, 0x2c, 0x7c, 0x2, 0x52, 0xa2, 0xf2, 0x5f, 0xf, 0xff, 0xaf, 0xb8, 0xe8, 0x18, 0x48, 0xe5, 0xb5, 0x45, 0x15, 0x6f, 0x3f, 0xcf, 0x9f, 0x32, 0x62, 0x92, 0xc2, 0xd5, 0x85, 0x75, 0x25, 0x88, 0xd8, 0x28, 0x78, 0x6, 0x56, 0xa6, 0xf6, 0x5b, 0xb, 0xfb, 0xab, 0xbc, 0xec, 0x1c, 0x4c, 0xe1, 0xb1, 0x41, 0x11, 0xbd, 0xed, 0x1d, 0x4d, 0xe0, 0xb0, 0x40, 0x10, 0x7, 0x57, 0xa7, 0xf7, 0x5a, 0xa, 0xfa, 0xaa, 0xd4, 0x84, 0x74, 0x24, 0x89, 0xd9, 0x29, 0x79, 0x6e, 0x3e, 0xce, 0x9e, 0x33, 0x63, 0x93, 0xc3, 0xd6, 0x86, 0x76, 0x26, 0x8b, 0xdb, 0x2b, 0x7b, 0x6c, 0x3c, 0xcc, 0x9c, 0x31, 0x61, 0x91, 0xc1, 0xbf, 0xef, 0x1f, 0x4f, 0xe2, 0xb2, 0x42, 0x12, 0x5, 0x55, 0xa5, 0xf5, 0x58, 0x8, 0xf8, 0xa8, 0x4, 0x54, 0xa4, 0xf4, 0x59, 0x9, 0xf9, 0xa9, 0xbe, 0xee, 0x1e, 0x4e, 0xe3, 0xb3, 0x43, 0x13, 0x6d, 0x3d, 0xcd, 0x9d, 0x30, 0x60, 0x90, 0xc0, 0xd7, 0x87, 0x77, 0x27, 0x8a, 0xda, 0x2a, 0x7a}, + {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18, 0x79, 0x28, 0xdb, 0x8a, 0x20, 0x71, 0x82, 0xd3, 0xcb, 0x9a, 0x69, 0x38, 0x92, 0xc3, 0x30, 0x61, 0xf2, 0xa3, 0x50, 0x1, 0xab, 0xfa, 0x9, 0x58, 0x40, 0x11, 0xe2, 0xb3, 0x19, 0x48, 0xbb, 0xea, 0x8b, 0xda, 0x29, 0x78, 0xd2, 0x83, 0x70, 0x21, 0x39, 0x68, 0x9b, 0xca, 0x60, 0x31, 0xc2, 0x93, 0xf9, 0xa8, 0x5b, 0xa, 0xa0, 0xf1, 0x2, 0x53, 0x4b, 0x1a, 0xe9, 0xb8, 0x12, 0x43, 0xb0, 0xe1, 0x80, 0xd1, 0x22, 0x73, 0xd9, 0x88, 0x7b, 0x2a, 0x32, 0x63, 0x90, 0xc1, 0x6b, 0x3a, 0xc9, 0x98, 0xb, 0x5a, 0xa9, 0xf8, 0x52, 0x3, 0xf0, 0xa1, 0xb9, 0xe8, 0x1b, 0x4a, 0xe0, 0xb1, 0x42, 0x13, 0x72, 0x23, 0xd0, 0x81, 0x2b, 0x7a, 0x89, 0xd8, 0xc0, 0x91, 0x62, 0x33, 0x99, 0xc8, 0x3b, 0x6a, 0xef, 0xbe, 0x4d, 0x1c, 0xb6, 0xe7, 0x14, 0x45, 0x5d, 0xc, 0xff, 0xae, 0x4, 0x55, 0xa6, 0xf7, 0x96, 0xc7, 0x34, 0x65, 0xcf, 0x9e, 0x6d, 0x3c, 0x24, 0x75, 0x86, 0xd7, 0x7d, 0x2c, 0xdf, 0x8e, 0x1d, 0x4c, 0xbf, 0xee, 0x44, 0x15, 0xe6, 0xb7, 0xaf, 0xfe, 0xd, 0x5c, 0xf6, 0xa7, 0x54, 0x5, 0x64, 0x35, 0xc6, 0x97, 0x3d, 0x6c, 0x9f, 0xce, 0xd6, 0x87, 0x74, 0x25, 0x8f, 0xde, 0x2d, 0x7c, 0x16, 0x47, 0xb4, 0xe5, 0x4f, 0x1e, 0xed, 0xbc, 0xa4, 0xf5, 0x6, 0x57, 0xfd, 0xac, 0x5f, 0xe, 0x6f, 0x3e, 0xcd, 0x9c, 0x36, 0x67, 0x94, 0xc5, 0xdd, 0x8c, 0x7f, 0x2e, 0x84, 0xd5, 0x26, 0x77, 0xe4, 0xb5, 0x46, 0x17, 0xbd, 0xec, 0x1f, 0x4e, 0x56, 0x7, 0xf4, 0xa5, 0xf, 0x5e, 0xad, 0xfc, 0x9d, 0xcc, 
0x3f, 0x6e, 0xc4, 0x95, 0x66, 0x37, 0x2f, 0x7e, 0x8d, 0xdc, 0x76, 0x27, 0xd4, 0x85}, + {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9, 0x49, 0x1b, 0xed, 0xbf, 0x1c, 0x4e, 0xb8, 0xea, 0xe3, 0xb1, 0x47, 0x15, 0xb6, 0xe4, 0x12, 0x40, 0x92, 0xc0, 0x36, 0x64, 0xc7, 0x95, 0x63, 0x31, 0x38, 0x6a, 0x9c, 0xce, 0x6d, 0x3f, 0xc9, 0x9b, 0xdb, 0x89, 0x7f, 0x2d, 0x8e, 0xdc, 0x2a, 0x78, 0x71, 0x23, 0xd5, 0x87, 0x24, 0x76, 0x80, 0xd2, 0x39, 0x6b, 0x9d, 0xcf, 0x6c, 0x3e, 0xc8, 0x9a, 0x93, 0xc1, 0x37, 0x65, 0xc6, 0x94, 0x62, 0x30, 0x70, 0x22, 0xd4, 0x86, 0x25, 0x77, 0x81, 0xd3, 0xda, 0x88, 0x7e, 0x2c, 0x8f, 0xdd, 0x2b, 0x79, 0xab, 0xf9, 0xf, 0x5d, 0xfe, 0xac, 0x5a, 0x8, 0x1, 0x53, 0xa5, 0xf7, 0x54, 0x6, 0xf0, 0xa2, 0xe2, 0xb0, 0x46, 0x14, 0xb7, 0xe5, 0x13, 0x41, 0x48, 0x1a, 0xec, 0xbe, 0x1d, 0x4f, 0xb9, 0xeb, 0x72, 0x20, 0xd6, 0x84, 0x27, 0x75, 0x83, 0xd1, 0xd8, 0x8a, 0x7c, 0x2e, 0x8d, 0xdf, 0x29, 0x7b, 0x3b, 0x69, 0x9f, 0xcd, 0x6e, 0x3c, 0xca, 0x98, 0x91, 0xc3, 0x35, 0x67, 0xc4, 0x96, 0x60, 0x32, 0xe0, 0xb2, 0x44, 0x16, 0xb5, 0xe7, 0x11, 0x43, 0x4a, 0x18, 0xee, 0xbc, 0x1f, 0x4d, 0xbb, 0xe9, 0xa9, 0xfb, 0xd, 0x5f, 0xfc, 0xae, 0x58, 0xa, 0x3, 0x51, 0xa7, 0xf5, 0x56, 0x4, 0xf2, 0xa0, 0x4b, 0x19, 0xef, 0xbd, 0x1e, 0x4c, 0xba, 0xe8, 0xe1, 0xb3, 0x45, 0x17, 0xb4, 0xe6, 0x10, 0x42, 0x2, 0x50, 0xa6, 0xf4, 0x57, 0x5, 0xf3, 0xa1, 0xa8, 0xfa, 0xc, 0x5e, 0xfd, 0xaf, 0x59, 0xb, 0xd9, 0x8b, 0x7d, 0x2f, 0x8c, 0xde, 0x28, 0x7a, 0x73, 0x21, 0xd7, 0x85, 0x26, 0x74, 0x82, 0xd0, 0x90, 0xc2, 0x34, 0x66, 0xc5, 0x97, 0x61, 0x33, 0x3a, 0x68, 0x9e, 0xcc, 0x6f, 0x3d, 0xcb, 0x99}, + {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6, 0x59, 0xa, 0xff, 0xac, 0x8, 0x5b, 0xae, 0xfd, 0xfb, 0xa8, 0x5d, 0xe, 0xaa, 0xf9, 0xc, 0x5f, 0xb2, 0xe1, 0x14, 0x47, 0xe3, 0xb0, 0x45, 0x16, 0x10, 0x43, 0xb6, 0xe5, 0x41, 0x12, 0xe7, 0xb4, 0xeb, 0xb8, 0x4d, 0x1e, 0xba, 0xe9, 0x1c, 0x4f, 0x49, 0x1a, 0xef, 0xbc, 0x18, 0x4b, 0xbe, 0xed, 0x79, 0x2a, 0xdf, 0x8c, 0x28, 0x7b, 0x8e, 0xdd, 0xdb, 0x88, 0x7d, 0x2e, 0x8a, 0xd9, 0x2c, 0x7f, 0x20, 0x73, 0x86, 0xd5, 0x71, 0x22, 0xd7, 0x84, 0x82, 0xd1, 0x24, 0x77, 0xd3, 0x80, 0x75, 0x26, 0xcb, 0x98, 0x6d, 0x3e, 0x9a, 0xc9, 0x3c, 0x6f, 0x69, 0x3a, 0xcf, 0x9c, 0x38, 0x6b, 0x9e, 0xcd, 0x92, 0xc1, 0x34, 0x67, 0xc3, 0x90, 0x65, 0x36, 0x30, 0x63, 0x96, 0xc5, 0x61, 0x32, 0xc7, 0x94, 0xf2, 0xa1, 0x54, 0x7, 0xa3, 0xf0, 0x5, 0x56, 0x50, 0x3, 0xf6, 0xa5, 0x1, 0x52, 0xa7, 0xf4, 0xab, 0xf8, 0xd, 0x5e, 0xfa, 0xa9, 0x5c, 0xf, 0x9, 0x5a, 0xaf, 0xfc, 0x58, 0xb, 0xfe, 0xad, 0x40, 0x13, 0xe6, 0xb5, 0x11, 0x42, 0xb7, 0xe4, 0xe2, 0xb1, 0x44, 0x17, 0xb3, 0xe0, 0x15, 0x46, 0x19, 0x4a, 0xbf, 0xec, 0x48, 0x1b, 0xee, 0xbd, 0xbb, 0xe8, 0x1d, 0x4e, 0xea, 0xb9, 0x4c, 0x1f, 0x8b, 0xd8, 0x2d, 0x7e, 0xda, 0x89, 0x7c, 0x2f, 0x29, 0x7a, 0x8f, 0xdc, 0x78, 0x2b, 0xde, 0x8d, 0xd2, 0x81, 0x74, 0x27, 0x83, 0xd0, 0x25, 0x76, 0x70, 0x23, 0xd6, 0x85, 0x21, 0x72, 0x87, 0xd4, 0x39, 0x6a, 0x9f, 0xcc, 0x68, 0x3b, 0xce, 0x9d, 0x9b, 0xc8, 0x3d, 0x6e, 0xca, 0x99, 0x6c, 0x3f, 0x60, 0x33, 0xc6, 0x95, 0x31, 0x62, 0x97, 0xc4, 0xc2, 0x91, 0x64, 0x37, 0x93, 0xc0, 0x35, 0x66}, + {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b, 0x29, 0x7d, 0x81, 0xd5, 0x64, 0x30, 0xcc, 0x98, 0xb3, 0xe7, 0x1b, 0x4f, 0xfe, 0xaa, 0x56, 0x2, 0x52, 0x6, 0xfa, 0xae, 0x1f, 0x4b, 0xb7, 0xe3, 0xc8, 0x9c, 0x60, 0x34, 0x85, 0xd1, 0x2d, 0x79, 0x7b, 0x2f, 0xd3, 0x87, 0x36, 0x62, 0x9e, 0xca, 0xe1, 0xb5, 0x49, 0x1d, 0xac, 0xf8, 0x4, 0x50, 0xa4, 0xf0, 0xc, 0x58, 0xe9, 0xbd, 
0x41, 0x15, 0x3e, 0x6a, 0x96, 0xc2, 0x73, 0x27, 0xdb, 0x8f, 0x8d, 0xd9, 0x25, 0x71, 0xc0, 0x94, 0x68, 0x3c, 0x17, 0x43, 0xbf, 0xeb, 0x5a, 0xe, 0xf2, 0xa6, 0xf6, 0xa2, 0x5e, 0xa, 0xbb, 0xef, 0x13, 0x47, 0x6c, 0x38, 0xc4, 0x90, 0x21, 0x75, 0x89, 0xdd, 0xdf, 0x8b, 0x77, 0x23, 0x92, 0xc6, 0x3a, 0x6e, 0x45, 0x11, 0xed, 0xb9, 0x8, 0x5c, 0xa0, 0xf4, 0x55, 0x1, 0xfd, 0xa9, 0x18, 0x4c, 0xb0, 0xe4, 0xcf, 0x9b, 0x67, 0x33, 0x82, 0xd6, 0x2a, 0x7e, 0x7c, 0x28, 0xd4, 0x80, 0x31, 0x65, 0x99, 0xcd, 0xe6, 0xb2, 0x4e, 0x1a, 0xab, 0xff, 0x3, 0x57, 0x7, 0x53, 0xaf, 0xfb, 0x4a, 0x1e, 0xe2, 0xb6, 0x9d, 0xc9, 0x35, 0x61, 0xd0, 0x84, 0x78, 0x2c, 0x2e, 0x7a, 0x86, 0xd2, 0x63, 0x37, 0xcb, 0x9f, 0xb4, 0xe0, 0x1c, 0x48, 0xf9, 0xad, 0x51, 0x5, 0xf1, 0xa5, 0x59, 0xd, 0xbc, 0xe8, 0x14, 0x40, 0x6b, 0x3f, 0xc3, 0x97, 0x26, 0x72, 0x8e, 0xda, 0xd8, 0x8c, 0x70, 0x24, 0x95, 0xc1, 0x3d, 0x69, 0x42, 0x16, 0xea, 0xbe, 0xf, 0x5b, 0xa7, 0xf3, 0xa3, 0xf7, 0xb, 0x5f, 0xee, 0xba, 0x46, 0x12, 0x39, 0x6d, 0x91, 0xc5, 0x74, 0x20, 0xdc, 0x88, 0x8a, 0xde, 0x22, 0x76, 0xc7, 0x93, 0x6f, 0x3b, 0x10, 0x44, 0xb8, 0xec, 0x5d, 0x9, 0xf5, 0xa1}, + {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24, 0x39, 0x6c, 0x93, 0xc6, 0x70, 0x25, 0xda, 0x8f, 0xab, 0xfe, 0x1, 0x54, 0xe2, 0xb7, 0x48, 0x1d, 0x72, 0x27, 0xd8, 0x8d, 0x3b, 0x6e, 0x91, 0xc4, 0xe0, 0xb5, 0x4a, 0x1f, 0xa9, 0xfc, 0x3, 0x56, 0x4b, 0x1e, 0xe1, 0xb4, 0x2, 0x57, 0xa8, 0xfd, 0xd9, 0x8c, 0x73, 0x26, 0x90, 0xc5, 0x3a, 0x6f, 0xe4, 0xb1, 0x4e, 0x1b, 0xad, 0xf8, 0x7, 0x52, 0x76, 0x23, 0xdc, 0x89, 0x3f, 0x6a, 0x95, 0xc0, 0xdd, 0x88, 0x77, 0x22, 0x94, 0xc1, 0x3e, 0x6b, 0x4f, 0x1a, 0xe5, 0xb0, 0x6, 0x53, 0xac, 0xf9, 0x96, 0xc3, 0x3c, 0x69, 0xdf, 0x8a, 0x75, 0x20, 0x4, 0x51, 0xae, 0xfb, 0x4d, 0x18, 0xe7, 0xb2, 0xaf, 0xfa, 0x5, 0x50, 0xe6, 0xb3, 0x4c, 0x19, 0x3d, 0x68, 0x97, 0xc2, 0x74, 0x21, 0xde, 0x8b, 0xd5, 0x80, 0x7f, 0x2a, 0x9c, 0xc9, 0x36, 0x63, 0x47, 0x12, 0xed, 0xb8, 0xe, 0x5b, 0xa4, 0xf1, 0xec, 0xb9, 0x46, 0x13, 0xa5, 0xf0, 0xf, 0x5a, 0x7e, 0x2b, 0xd4, 0x81, 0x37, 0x62, 0x9d, 0xc8, 0xa7, 0xf2, 0xd, 0x58, 0xee, 0xbb, 0x44, 0x11, 0x35, 0x60, 0x9f, 0xca, 0x7c, 0x29, 0xd6, 0x83, 0x9e, 0xcb, 0x34, 0x61, 0xd7, 0x82, 0x7d, 0x28, 0xc, 0x59, 0xa6, 0xf3, 0x45, 0x10, 0xef, 0xba, 0x31, 0x64, 0x9b, 0xce, 0x78, 0x2d, 0xd2, 0x87, 0xa3, 0xf6, 0x9, 0x5c, 0xea, 0xbf, 0x40, 0x15, 0x8, 0x5d, 0xa2, 0xf7, 0x41, 0x14, 0xeb, 0xbe, 0x9a, 0xcf, 0x30, 0x65, 0xd3, 0x86, 0x79, 0x2c, 0x43, 0x16, 0xe9, 0xbc, 0xa, 0x5f, 0xa0, 0xf5, 0xd1, 0x84, 0x7b, 0x2e, 0x98, 0xcd, 0x32, 0x67, 0x7a, 0x2f, 0xd0, 0x85, 0x33, 0x66, 0x99, 0xcc, 0xe8, 0xbd, 0x42, 0x17, 0xa1, 0xf4, 0xb, 0x5e}, + {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35, 0x9, 0x5f, 0xa5, 0xf3, 0x4c, 0x1a, 0xe0, 0xb6, 0x83, 0xd5, 0x2f, 0x79, 0xc6, 0x90, 0x6a, 0x3c, 0x12, 0x44, 0xbe, 0xe8, 0x57, 0x1, 0xfb, 0xad, 0x98, 0xce, 0x34, 0x62, 0xdd, 0x8b, 0x71, 0x27, 0x1b, 0x4d, 0xb7, 0xe1, 0x5e, 0x8, 0xf2, 0xa4, 0x91, 0xc7, 0x3d, 0x6b, 0xd4, 0x82, 0x78, 0x2e, 0x24, 0x72, 0x88, 0xde, 0x61, 0x37, 0xcd, 0x9b, 0xae, 0xf8, 0x2, 0x54, 0xeb, 0xbd, 0x47, 0x11, 0x2d, 0x7b, 0x81, 0xd7, 0x68, 0x3e, 0xc4, 0x92, 0xa7, 0xf1, 0xb, 0x5d, 0xe2, 0xb4, 0x4e, 0x18, 0x36, 0x60, 0x9a, 0xcc, 0x73, 0x25, 0xdf, 0x89, 0xbc, 0xea, 0x10, 0x46, 0xf9, 0xaf, 0x55, 0x3, 0x3f, 0x69, 0x93, 0xc5, 0x7a, 0x2c, 0xd6, 0x80, 0xb5, 0xe3, 0x19, 0x4f, 0xf0, 0xa6, 0x5c, 0xa, 0x48, 0x1e, 0xe4, 0xb2, 0xd, 0x5b, 0xa1, 0xf7, 0xc2, 0x94, 0x6e, 0x38, 0x87, 0xd1, 0x2b, 0x7d, 0x41, 0x17, 0xed, 0xbb, 0x4, 0x52, 0xa8, 0xfe, 0xcb, 0x9d, 0x67, 
0x31, 0x8e, 0xd8, 0x22, 0x74, 0x5a, 0xc, 0xf6, 0xa0, 0x1f, 0x49, 0xb3, 0xe5, 0xd0, 0x86, 0x7c, 0x2a, 0x95, 0xc3, 0x39, 0x6f, 0x53, 0x5, 0xff, 0xa9, 0x16, 0x40, 0xba, 0xec, 0xd9, 0x8f, 0x75, 0x23, 0x9c, 0xca, 0x30, 0x66, 0x6c, 0x3a, 0xc0, 0x96, 0x29, 0x7f, 0x85, 0xd3, 0xe6, 0xb0, 0x4a, 0x1c, 0xa3, 0xf5, 0xf, 0x59, 0x65, 0x33, 0xc9, 0x9f, 0x20, 0x76, 0x8c, 0xda, 0xef, 0xb9, 0x43, 0x15, 0xaa, 0xfc, 0x6, 0x50, 0x7e, 0x28, 0xd2, 0x84, 0x3b, 0x6d, 0x97, 0xc1, 0xf4, 0xa2, 0x58, 0xe, 0xb1, 0xe7, 0x1d, 0x4b, 0x77, 0x21, 0xdb, 0x8d, 0x32, 0x64, 0x9e, 0xc8, 0xfd, 0xab, 0x51, 0x7, 0xb8, 0xee, 0x14, 0x42}, + {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a, 0x19, 0x4e, 0xb7, 0xe0, 0x58, 0xf, 0xf6, 0xa1, 0x9b, 0xcc, 0x35, 0x62, 0xda, 0x8d, 0x74, 0x23, 0x32, 0x65, 0x9c, 0xcb, 0x73, 0x24, 0xdd, 0x8a, 0xb0, 0xe7, 0x1e, 0x49, 0xf1, 0xa6, 0x5f, 0x8, 0x2b, 0x7c, 0x85, 0xd2, 0x6a, 0x3d, 0xc4, 0x93, 0xa9, 0xfe, 0x7, 0x50, 0xe8, 0xbf, 0x46, 0x11, 0x64, 0x33, 0xca, 0x9d, 0x25, 0x72, 0x8b, 0xdc, 0xe6, 0xb1, 0x48, 0x1f, 0xa7, 0xf0, 0x9, 0x5e, 0x7d, 0x2a, 0xd3, 0x84, 0x3c, 0x6b, 0x92, 0xc5, 0xff, 0xa8, 0x51, 0x6, 0xbe, 0xe9, 0x10, 0x47, 0x56, 0x1, 0xf8, 0xaf, 0x17, 0x40, 0xb9, 0xee, 0xd4, 0x83, 0x7a, 0x2d, 0x95, 0xc2, 0x3b, 0x6c, 0x4f, 0x18, 0xe1, 0xb6, 0xe, 0x59, 0xa0, 0xf7, 0xcd, 0x9a, 0x63, 0x34, 0x8c, 0xdb, 0x22, 0x75, 0xc8, 0x9f, 0x66, 0x31, 0x89, 0xde, 0x27, 0x70, 0x4a, 0x1d, 0xe4, 0xb3, 0xb, 0x5c, 0xa5, 0xf2, 0xd1, 0x86, 0x7f, 0x28, 0x90, 0xc7, 0x3e, 0x69, 0x53, 0x4, 0xfd, 0xaa, 0x12, 0x45, 0xbc, 0xeb, 0xfa, 0xad, 0x54, 0x3, 0xbb, 0xec, 0x15, 0x42, 0x78, 0x2f, 0xd6, 0x81, 0x39, 0x6e, 0x97, 0xc0, 0xe3, 0xb4, 0x4d, 0x1a, 0xa2, 0xf5, 0xc, 0x5b, 0x61, 0x36, 0xcf, 0x98, 0x20, 0x77, 0x8e, 0xd9, 0xac, 0xfb, 0x2, 0x55, 0xed, 0xba, 0x43, 0x14, 0x2e, 0x79, 0x80, 0xd7, 0x6f, 0x38, 0xc1, 0x96, 0xb5, 0xe2, 0x1b, 0x4c, 0xf4, 0xa3, 0x5a, 0xd, 0x37, 0x60, 0x99, 0xce, 0x76, 0x21, 0xd8, 0x8f, 0x9e, 0xc9, 0x30, 0x67, 0xdf, 0x88, 0x71, 0x26, 0x1c, 0x4b, 0xb2, 0xe5, 0x5d, 0xa, 0xf3, 0xa4, 0x87, 0xd0, 0x29, 0x7e, 0xc6, 0x91, 0x68, 0x3f, 0x5, 0x52, 0xab, 0xfc, 0x44, 0x13, 0xea, 0xbd}, + {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f, 0xe9, 0xb1, 0x59, 0x1, 0x94, 0xcc, 0x24, 0x7c, 0x13, 0x4b, 0xa3, 0xfb, 0x6e, 0x36, 0xde, 0x86, 0xcf, 0x97, 0x7f, 0x27, 0xb2, 0xea, 0x2, 0x5a, 0x35, 0x6d, 0x85, 0xdd, 0x48, 0x10, 0xf8, 0xa0, 0x26, 0x7e, 0x96, 0xce, 0x5b, 0x3, 0xeb, 0xb3, 0xdc, 0x84, 0x6c, 0x34, 0xa1, 0xf9, 0x11, 0x49, 0x83, 0xdb, 0x33, 0x6b, 0xfe, 0xa6, 0x4e, 0x16, 0x79, 0x21, 0xc9, 0x91, 0x4, 0x5c, 0xb4, 0xec, 0x6a, 0x32, 0xda, 0x82, 0x17, 0x4f, 0xa7, 0xff, 0x90, 0xc8, 0x20, 0x78, 0xed, 0xb5, 0x5d, 0x5, 0x4c, 0x14, 0xfc, 0xa4, 0x31, 0x69, 0x81, 0xd9, 0xb6, 0xee, 0x6, 0x5e, 0xcb, 0x93, 0x7b, 0x23, 0xa5, 0xfd, 0x15, 0x4d, 0xd8, 0x80, 0x68, 0x30, 0x5f, 0x7, 0xef, 0xb7, 0x22, 0x7a, 0x92, 0xca, 0x1b, 0x43, 0xab, 0xf3, 0x66, 0x3e, 0xd6, 0x8e, 0xe1, 0xb9, 0x51, 0x9, 0x9c, 0xc4, 0x2c, 0x74, 0xf2, 0xaa, 0x42, 0x1a, 0x8f, 0xd7, 0x3f, 0x67, 0x8, 0x50, 0xb8, 0xe0, 0x75, 0x2d, 0xc5, 0x9d, 0xd4, 0x8c, 0x64, 0x3c, 0xa9, 0xf1, 0x19, 0x41, 0x2e, 0x76, 0x9e, 0xc6, 0x53, 0xb, 0xe3, 0xbb, 0x3d, 0x65, 0x8d, 0xd5, 0x40, 0x18, 0xf0, 0xa8, 0xc7, 0x9f, 0x77, 0x2f, 0xba, 0xe2, 0xa, 0x52, 0x98, 0xc0, 0x28, 0x70, 0xe5, 0xbd, 0x55, 0xd, 0x62, 0x3a, 0xd2, 0x8a, 0x1f, 0x47, 0xaf, 0xf7, 0x71, 0x29, 0xc1, 0x99, 0xc, 0x54, 0xbc, 0xe4, 0x8b, 0xd3, 0x3b, 0x63, 0xf6, 0xae, 0x46, 0x1e, 0x57, 0xf, 0xe7, 0xbf, 0x2a, 0x72, 0x9a, 0xc2, 0xad, 0xf5, 0x1d, 0x45, 0xd0, 0x88, 0x60, 0x38, 
0xbe, 0xe6, 0xe, 0x56, 0xc3, 0x9b, 0x73, 0x2b, 0x44, 0x1c, 0xf4, 0xac, 0x39, 0x61, 0x89, 0xd1}, + {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60, 0xf9, 0xa0, 0x4b, 0x12, 0x80, 0xd9, 0x32, 0x6b, 0xb, 0x52, 0xb9, 0xe0, 0x72, 0x2b, 0xc0, 0x99, 0xef, 0xb6, 0x5d, 0x4, 0x96, 0xcf, 0x24, 0x7d, 0x1d, 0x44, 0xaf, 0xf6, 0x64, 0x3d, 0xd6, 0x8f, 0x16, 0x4f, 0xa4, 0xfd, 0x6f, 0x36, 0xdd, 0x84, 0xe4, 0xbd, 0x56, 0xf, 0x9d, 0xc4, 0x2f, 0x76, 0xc3, 0x9a, 0x71, 0x28, 0xba, 0xe3, 0x8, 0x51, 0x31, 0x68, 0x83, 0xda, 0x48, 0x11, 0xfa, 0xa3, 0x3a, 0x63, 0x88, 0xd1, 0x43, 0x1a, 0xf1, 0xa8, 0xc8, 0x91, 0x7a, 0x23, 0xb1, 0xe8, 0x3, 0x5a, 0x2c, 0x75, 0x9e, 0xc7, 0x55, 0xc, 0xe7, 0xbe, 0xde, 0x87, 0x6c, 0x35, 0xa7, 0xfe, 0x15, 0x4c, 0xd5, 0x8c, 0x67, 0x3e, 0xac, 0xf5, 0x1e, 0x47, 0x27, 0x7e, 0x95, 0xcc, 0x5e, 0x7, 0xec, 0xb5, 0x9b, 0xc2, 0x29, 0x70, 0xe2, 0xbb, 0x50, 0x9, 0x69, 0x30, 0xdb, 0x82, 0x10, 0x49, 0xa2, 0xfb, 0x62, 0x3b, 0xd0, 0x89, 0x1b, 0x42, 0xa9, 0xf0, 0x90, 0xc9, 0x22, 0x7b, 0xe9, 0xb0, 0x5b, 0x2, 0x74, 0x2d, 0xc6, 0x9f, 0xd, 0x54, 0xbf, 0xe6, 0x86, 0xdf, 0x34, 0x6d, 0xff, 0xa6, 0x4d, 0x14, 0x8d, 0xd4, 0x3f, 0x66, 0xf4, 0xad, 0x46, 0x1f, 0x7f, 0x26, 0xcd, 0x94, 0x6, 0x5f, 0xb4, 0xed, 0x58, 0x1, 0xea, 0xb3, 0x21, 0x78, 0x93, 0xca, 0xaa, 0xf3, 0x18, 0x41, 0xd3, 0x8a, 0x61, 0x38, 0xa1, 0xf8, 0x13, 0x4a, 0xd8, 0x81, 0x6a, 0x33, 0x53, 0xa, 0xe1, 0xb8, 0x2a, 0x73, 0x98, 0xc1, 0xb7, 0xee, 0x5, 0x5c, 0xce, 0x97, 0x7c, 0x25, 0x45, 0x1c, 0xf7, 0xae, 0x3c, 0x65, 0x8e, 0xd7, 0x4e, 0x17, 0xfc, 0xa5, 0x37, 0x6e, 0x85, 0xdc, 0xbc, 0xe5, 0xe, 0x57, 0xc5, 0x9c, 0x77, 0x2e}, + {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71, 0xc9, 0x93, 0x7d, 0x27, 0xbc, 0xe6, 0x8, 0x52, 0x23, 0x79, 0x97, 0xcd, 0x56, 0xc, 0xe2, 0xb8, 0x8f, 0xd5, 0x3b, 0x61, 0xfa, 0xa0, 0x4e, 0x14, 0x65, 0x3f, 0xd1, 0x8b, 0x10, 0x4a, 0xa4, 0xfe, 0x46, 0x1c, 0xf2, 0xa8, 0x33, 0x69, 0x87, 0xdd, 0xac, 0xf6, 0x18, 0x42, 0xd9, 0x83, 0x6d, 0x37, 0x3, 0x59, 0xb7, 0xed, 0x76, 0x2c, 0xc2, 0x98, 0xe9, 0xb3, 0x5d, 0x7, 0x9c, 0xc6, 0x28, 0x72, 0xca, 0x90, 0x7e, 0x24, 0xbf, 0xe5, 0xb, 0x51, 0x20, 0x7a, 0x94, 0xce, 0x55, 0xf, 0xe1, 0xbb, 0x8c, 0xd6, 0x38, 0x62, 0xf9, 0xa3, 0x4d, 0x17, 0x66, 0x3c, 0xd2, 0x88, 0x13, 0x49, 0xa7, 0xfd, 0x45, 0x1f, 0xf1, 0xab, 0x30, 0x6a, 0x84, 0xde, 0xaf, 0xf5, 0x1b, 0x41, 0xda, 0x80, 0x6e, 0x34, 0x6, 0x5c, 0xb2, 0xe8, 0x73, 0x29, 0xc7, 0x9d, 0xec, 0xb6, 0x58, 0x2, 0x99, 0xc3, 0x2d, 0x77, 0xcf, 0x95, 0x7b, 0x21, 0xba, 0xe0, 0xe, 0x54, 0x25, 0x7f, 0x91, 0xcb, 0x50, 0xa, 0xe4, 0xbe, 0x89, 0xd3, 0x3d, 0x67, 0xfc, 0xa6, 0x48, 0x12, 0x63, 0x39, 0xd7, 0x8d, 0x16, 0x4c, 0xa2, 0xf8, 0x40, 0x1a, 0xf4, 0xae, 0x35, 0x6f, 0x81, 0xdb, 0xaa, 0xf0, 0x1e, 0x44, 0xdf, 0x85, 0x6b, 0x31, 0x5, 0x5f, 0xb1, 0xeb, 0x70, 0x2a, 0xc4, 0x9e, 0xef, 0xb5, 0x5b, 0x1, 0x9a, 0xc0, 0x2e, 0x74, 0xcc, 0x96, 0x78, 0x22, 0xb9, 0xe3, 0xd, 0x57, 0x26, 0x7c, 0x92, 0xc8, 0x53, 0x9, 0xe7, 0xbd, 0x8a, 0xd0, 0x3e, 0x64, 0xff, 0xa5, 0x4b, 0x11, 0x60, 0x3a, 0xd4, 0x8e, 0x15, 0x4f, 0xa1, 0xfb, 0x43, 0x19, 0xf7, 0xad, 0x36, 0x6c, 0x82, 0xd8, 0xa9, 0xf3, 0x1d, 0x47, 0xdc, 0x86, 0x68, 0x32}, + {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e, 0xd9, 0x82, 0x6f, 0x34, 0xa8, 0xf3, 0x1e, 0x45, 0x3b, 0x60, 0x8d, 0xd6, 0x4a, 0x11, 0xfc, 0xa7, 0xaf, 0xf4, 0x19, 0x42, 0xde, 0x85, 0x68, 0x33, 0x4d, 0x16, 0xfb, 0xa0, 0x3c, 0x67, 0x8a, 0xd1, 0x76, 0x2d, 0xc0, 0x9b, 0x7, 0x5c, 0xb1, 0xea, 0x94, 0xcf, 0x22, 0x79, 0xe5, 0xbe, 0x53, 0x8, 0x43, 0x18, 0xf5, 0xae, 
0x32, 0x69, 0x84, 0xdf, 0xa1, 0xfa, 0x17, 0x4c, 0xd0, 0x8b, 0x66, 0x3d, 0x9a, 0xc1, 0x2c, 0x77, 0xeb, 0xb0, 0x5d, 0x6, 0x78, 0x23, 0xce, 0x95, 0x9, 0x52, 0xbf, 0xe4, 0xec, 0xb7, 0x5a, 0x1, 0x9d, 0xc6, 0x2b, 0x70, 0xe, 0x55, 0xb8, 0xe3, 0x7f, 0x24, 0xc9, 0x92, 0x35, 0x6e, 0x83, 0xd8, 0x44, 0x1f, 0xf2, 0xa9, 0xd7, 0x8c, 0x61, 0x3a, 0xa6, 0xfd, 0x10, 0x4b, 0x86, 0xdd, 0x30, 0x6b, 0xf7, 0xac, 0x41, 0x1a, 0x64, 0x3f, 0xd2, 0x89, 0x15, 0x4e, 0xa3, 0xf8, 0x5f, 0x4, 0xe9, 0xb2, 0x2e, 0x75, 0x98, 0xc3, 0xbd, 0xe6, 0xb, 0x50, 0xcc, 0x97, 0x7a, 0x21, 0x29, 0x72, 0x9f, 0xc4, 0x58, 0x3, 0xee, 0xb5, 0xcb, 0x90, 0x7d, 0x26, 0xba, 0xe1, 0xc, 0x57, 0xf0, 0xab, 0x46, 0x1d, 0x81, 0xda, 0x37, 0x6c, 0x12, 0x49, 0xa4, 0xff, 0x63, 0x38, 0xd5, 0x8e, 0xc5, 0x9e, 0x73, 0x28, 0xb4, 0xef, 0x2, 0x59, 0x27, 0x7c, 0x91, 0xca, 0x56, 0xd, 0xe0, 0xbb, 0x1c, 0x47, 0xaa, 0xf1, 0x6d, 0x36, 0xdb, 0x80, 0xfe, 0xa5, 0x48, 0x13, 0x8f, 0xd4, 0x39, 0x62, 0x6a, 0x31, 0xdc, 0x87, 0x1b, 0x40, 0xad, 0xf6, 0x88, 0xd3, 0x3e, 0x65, 0xf9, 0xa2, 0x4f, 0x14, 0xb3, 0xe8, 0x5, 0x5e, 0xc2, 0x99, 0x74, 0x2f, 0x51, 0xa, 0xe7, 0xbc, 0x20, 0x7b, 0x96, 0xcd}, + {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53, 0xa9, 0xf5, 0x11, 0x4d, 0xc4, 0x98, 0x7c, 0x20, 0x73, 0x2f, 0xcb, 0x97, 0x1e, 0x42, 0xa6, 0xfa, 0x4f, 0x13, 0xf7, 0xab, 0x22, 0x7e, 0x9a, 0xc6, 0x95, 0xc9, 0x2d, 0x71, 0xf8, 0xa4, 0x40, 0x1c, 0xe6, 0xba, 0x5e, 0x2, 0x8b, 0xd7, 0x33, 0x6f, 0x3c, 0x60, 0x84, 0xd8, 0x51, 0xd, 0xe9, 0xb5, 0x9e, 0xc2, 0x26, 0x7a, 0xf3, 0xaf, 0x4b, 0x17, 0x44, 0x18, 0xfc, 0xa0, 0x29, 0x75, 0x91, 0xcd, 0x37, 0x6b, 0x8f, 0xd3, 0x5a, 0x6, 0xe2, 0xbe, 0xed, 0xb1, 0x55, 0x9, 0x80, 0xdc, 0x38, 0x64, 0xd1, 0x8d, 0x69, 0x35, 0xbc, 0xe0, 0x4, 0x58, 0xb, 0x57, 0xb3, 0xef, 0x66, 0x3a, 0xde, 0x82, 0x78, 0x24, 0xc0, 0x9c, 0x15, 0x49, 0xad, 0xf1, 0xa2, 0xfe, 0x1a, 0x46, 0xcf, 0x93, 0x77, 0x2b, 0x21, 0x7d, 0x99, 0xc5, 0x4c, 0x10, 0xf4, 0xa8, 0xfb, 0xa7, 0x43, 0x1f, 0x96, 0xca, 0x2e, 0x72, 0x88, 0xd4, 0x30, 0x6c, 0xe5, 0xb9, 0x5d, 0x1, 0x52, 0xe, 0xea, 0xb6, 0x3f, 0x63, 0x87, 0xdb, 0x6e, 0x32, 0xd6, 0x8a, 0x3, 0x5f, 0xbb, 0xe7, 0xb4, 0xe8, 0xc, 0x50, 0xd9, 0x85, 0x61, 0x3d, 0xc7, 0x9b, 0x7f, 0x23, 0xaa, 0xf6, 0x12, 0x4e, 0x1d, 0x41, 0xa5, 0xf9, 0x70, 0x2c, 0xc8, 0x94, 0xbf, 0xe3, 0x7, 0x5b, 0xd2, 0x8e, 0x6a, 0x36, 0x65, 0x39, 0xdd, 0x81, 0x8, 0x54, 0xb0, 0xec, 0x16, 0x4a, 0xae, 0xf2, 0x7b, 0x27, 0xc3, 0x9f, 0xcc, 0x90, 0x74, 0x28, 0xa1, 0xfd, 0x19, 0x45, 0xf0, 0xac, 0x48, 0x14, 0x9d, 0xc1, 0x25, 0x79, 0x2a, 0x76, 0x92, 0xce, 0x47, 0x1b, 0xff, 0xa3, 0x59, 0x5, 0xe1, 0xbd, 0x34, 0x68, 0x8c, 0xd0, 0x83, 0xdf, 0x3b, 0x67, 0xee, 0xb2, 0x56, 0xa}, + {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c, 0xb9, 0xe4, 0x3, 0x5e, 0xd0, 0x8d, 0x6a, 0x37, 0x6b, 0x36, 0xd1, 0x8c, 0x2, 0x5f, 0xb8, 0xe5, 0x6f, 0x32, 0xd5, 0x88, 0x6, 0x5b, 0xbc, 0xe1, 0xbd, 0xe0, 0x7, 0x5a, 0xd4, 0x89, 0x6e, 0x33, 0xd6, 0x8b, 0x6c, 0x31, 0xbf, 0xe2, 0x5, 0x58, 0x4, 0x59, 0xbe, 0xe3, 0x6d, 0x30, 0xd7, 0x8a, 0xde, 0x83, 0x64, 0x39, 0xb7, 0xea, 0xd, 0x50, 0xc, 0x51, 0xb6, 0xeb, 0x65, 0x38, 0xdf, 0x82, 0x67, 0x3a, 0xdd, 0x80, 0xe, 0x53, 0xb4, 0xe9, 0xb5, 0xe8, 0xf, 0x52, 0xdc, 0x81, 0x66, 0x3b, 0xb1, 0xec, 0xb, 0x56, 0xd8, 0x85, 0x62, 0x3f, 0x63, 0x3e, 0xd9, 0x84, 0xa, 0x57, 0xb0, 0xed, 0x8, 0x55, 0xb2, 0xef, 0x61, 0x3c, 0xdb, 0x86, 0xda, 0x87, 0x60, 0x3d, 0xb3, 0xee, 0x9, 0x54, 0xa1, 0xfc, 0x1b, 0x46, 0xc8, 0x95, 0x72, 0x2f, 0x73, 0x2e, 0xc9, 0x94, 0x1a, 0x47, 0xa0, 0xfd, 0x18, 0x45, 0xa2, 0xff, 0x71, 0x2c, 0xcb, 0x96, 0xca, 0x97, 
0x70, 0x2d, 0xa3, 0xfe, 0x19, 0x44, 0xce, 0x93, 0x74, 0x29, 0xa7, 0xfa, 0x1d, 0x40, 0x1c, 0x41, 0xa6, 0xfb, 0x75, 0x28, 0xcf, 0x92, 0x77, 0x2a, 0xcd, 0x90, 0x1e, 0x43, 0xa4, 0xf9, 0xa5, 0xf8, 0x1f, 0x42, 0xcc, 0x91, 0x76, 0x2b, 0x7f, 0x22, 0xc5, 0x98, 0x16, 0x4b, 0xac, 0xf1, 0xad, 0xf0, 0x17, 0x4a, 0xc4, 0x99, 0x7e, 0x23, 0xc6, 0x9b, 0x7c, 0x21, 0xaf, 0xf2, 0x15, 0x48, 0x14, 0x49, 0xae, 0xf3, 0x7d, 0x20, 0xc7, 0x9a, 0x10, 0x4d, 0xaa, 0xf7, 0x79, 0x24, 0xc3, 0x9e, 0xc2, 0x9f, 0x78, 0x25, 0xab, 0xf6, 0x11, 0x4c, 0xa9, 0xf4, 0x13, 0x4e, 0xc0, 0x9d, 0x7a, 0x27, 0x7b, 0x26, 0xc1, 0x9c, 0x12, 0x4f, 0xa8, 0xf5}, + {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d, 0x89, 0xd7, 0x35, 0x6b, 0xec, 0xb2, 0x50, 0xe, 0x43, 0x1d, 0xff, 0xa1, 0x26, 0x78, 0x9a, 0xc4, 0xf, 0x51, 0xb3, 0xed, 0x6a, 0x34, 0xd6, 0x88, 0xc5, 0x9b, 0x79, 0x27, 0xa0, 0xfe, 0x1c, 0x42, 0x86, 0xd8, 0x3a, 0x64, 0xe3, 0xbd, 0x5f, 0x1, 0x4c, 0x12, 0xf0, 0xae, 0x29, 0x77, 0x95, 0xcb, 0x1e, 0x40, 0xa2, 0xfc, 0x7b, 0x25, 0xc7, 0x99, 0xd4, 0x8a, 0x68, 0x36, 0xb1, 0xef, 0xd, 0x53, 0x97, 0xc9, 0x2b, 0x75, 0xf2, 0xac, 0x4e, 0x10, 0x5d, 0x3, 0xe1, 0xbf, 0x38, 0x66, 0x84, 0xda, 0x11, 0x4f, 0xad, 0xf3, 0x74, 0x2a, 0xc8, 0x96, 0xdb, 0x85, 0x67, 0x39, 0xbe, 0xe0, 0x2, 0x5c, 0x98, 0xc6, 0x24, 0x7a, 0xfd, 0xa3, 0x41, 0x1f, 0x52, 0xc, 0xee, 0xb0, 0x37, 0x69, 0x8b, 0xd5, 0x3c, 0x62, 0x80, 0xde, 0x59, 0x7, 0xe5, 0xbb, 0xf6, 0xa8, 0x4a, 0x14, 0x93, 0xcd, 0x2f, 0x71, 0xb5, 0xeb, 0x9, 0x57, 0xd0, 0x8e, 0x6c, 0x32, 0x7f, 0x21, 0xc3, 0x9d, 0x1a, 0x44, 0xa6, 0xf8, 0x33, 0x6d, 0x8f, 0xd1, 0x56, 0x8, 0xea, 0xb4, 0xf9, 0xa7, 0x45, 0x1b, 0x9c, 0xc2, 0x20, 0x7e, 0xba, 0xe4, 0x6, 0x58, 0xdf, 0x81, 0x63, 0x3d, 0x70, 0x2e, 0xcc, 0x92, 0x15, 0x4b, 0xa9, 0xf7, 0x22, 0x7c, 0x9e, 0xc0, 0x47, 0x19, 0xfb, 0xa5, 0xe8, 0xb6, 0x54, 0xa, 0x8d, 0xd3, 0x31, 0x6f, 0xab, 0xf5, 0x17, 0x49, 0xce, 0x90, 0x72, 0x2c, 0x61, 0x3f, 0xdd, 0x83, 0x4, 0x5a, 0xb8, 0xe6, 0x2d, 0x73, 0x91, 0xcf, 0x48, 0x16, 0xf4, 0xaa, 0xe7, 0xb9, 0x5b, 0x5, 0x82, 0xdc, 0x3e, 0x60, 0xa4, 0xfa, 0x18, 0x46, 0xc1, 0x9f, 0x7d, 0x23, 0x6e, 0x30, 0xd2, 0x8c, 0xb, 0x55, 0xb7, 0xe9}, + {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42, 0x99, 0xc6, 0x27, 0x78, 0xf8, 0xa7, 0x46, 0x19, 0x5b, 0x4, 0xe5, 0xba, 0x3a, 0x65, 0x84, 0xdb, 0x2f, 0x70, 0x91, 0xce, 0x4e, 0x11, 0xf0, 0xaf, 0xed, 0xb2, 0x53, 0xc, 0x8c, 0xd3, 0x32, 0x6d, 0xb6, 0xe9, 0x8, 0x57, 0xd7, 0x88, 0x69, 0x36, 0x74, 0x2b, 0xca, 0x95, 0x15, 0x4a, 0xab, 0xf4, 0x5e, 0x1, 0xe0, 0xbf, 0x3f, 0x60, 0x81, 0xde, 0x9c, 0xc3, 0x22, 0x7d, 0xfd, 0xa2, 0x43, 0x1c, 0xc7, 0x98, 0x79, 0x26, 0xa6, 0xf9, 0x18, 0x47, 0x5, 0x5a, 0xbb, 0xe4, 0x64, 0x3b, 0xda, 0x85, 0x71, 0x2e, 0xcf, 0x90, 0x10, 0x4f, 0xae, 0xf1, 0xb3, 0xec, 0xd, 0x52, 0xd2, 0x8d, 0x6c, 0x33, 0xe8, 0xb7, 0x56, 0x9, 0x89, 0xd6, 0x37, 0x68, 0x2a, 0x75, 0x94, 0xcb, 0x4b, 0x14, 0xf5, 0xaa, 0xbc, 0xe3, 0x2, 0x5d, 0xdd, 0x82, 0x63, 0x3c, 0x7e, 0x21, 0xc0, 0x9f, 0x1f, 0x40, 0xa1, 0xfe, 0x25, 0x7a, 0x9b, 0xc4, 0x44, 0x1b, 0xfa, 0xa5, 0xe7, 0xb8, 0x59, 0x6, 0x86, 0xd9, 0x38, 0x67, 0x93, 0xcc, 0x2d, 0x72, 0xf2, 0xad, 0x4c, 0x13, 0x51, 0xe, 0xef, 0xb0, 0x30, 0x6f, 0x8e, 0xd1, 0xa, 0x55, 0xb4, 0xeb, 0x6b, 0x34, 0xd5, 0x8a, 0xc8, 0x97, 0x76, 0x29, 0xa9, 0xf6, 0x17, 0x48, 0xe2, 0xbd, 0x5c, 0x3, 0x83, 0xdc, 0x3d, 0x62, 0x20, 0x7f, 0x9e, 0xc1, 0x41, 0x1e, 0xff, 0xa0, 0x7b, 0x24, 0xc5, 0x9a, 0x1a, 0x45, 0xa4, 0xfb, 0xb9, 0xe6, 0x7, 0x58, 0xd8, 0x87, 0x66, 0x39, 0xcd, 0x92, 0x73, 0x2c, 0xac, 0xf3, 0x12, 0x4d, 0xf, 0x50, 0xb1, 0xee, 0x6e, 0x31, 
0xd0, 0x8f, 0x54, 0xb, 0xea, 0xb5, 0x35, 0x6a, 0x8b, 0xd4, 0x96, 0xc9, 0x28, 0x77, 0xf7, 0xa8, 0x49, 0x16}, + {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a, 0x4e, 0x2e, 0x8e, 0xee, 0xd3, 0xb3, 0x13, 0x73, 0x69, 0x9, 0xa9, 0xc9, 0xf4, 0x94, 0x34, 0x54, 0x9c, 0xfc, 0x5c, 0x3c, 0x1, 0x61, 0xc1, 0xa1, 0xbb, 0xdb, 0x7b, 0x1b, 0x26, 0x46, 0xe6, 0x86, 0xd2, 0xb2, 0x12, 0x72, 0x4f, 0x2f, 0x8f, 0xef, 0xf5, 0x95, 0x35, 0x55, 0x68, 0x8, 0xa8, 0xc8, 0x25, 0x45, 0xe5, 0x85, 0xb8, 0xd8, 0x78, 0x18, 0x2, 0x62, 0xc2, 0xa2, 0x9f, 0xff, 0x5f, 0x3f, 0x6b, 0xb, 0xab, 0xcb, 0xf6, 0x96, 0x36, 0x56, 0x4c, 0x2c, 0x8c, 0xec, 0xd1, 0xb1, 0x11, 0x71, 0xb9, 0xd9, 0x79, 0x19, 0x24, 0x44, 0xe4, 0x84, 0x9e, 0xfe, 0x5e, 0x3e, 0x3, 0x63, 0xc3, 0xa3, 0xf7, 0x97, 0x37, 0x57, 0x6a, 0xa, 0xaa, 0xca, 0xd0, 0xb0, 0x10, 0x70, 0x4d, 0x2d, 0x8d, 0xed, 0x4a, 0x2a, 0x8a, 0xea, 0xd7, 0xb7, 0x17, 0x77, 0x6d, 0xd, 0xad, 0xcd, 0xf0, 0x90, 0x30, 0x50, 0x4, 0x64, 0xc4, 0xa4, 0x99, 0xf9, 0x59, 0x39, 0x23, 0x43, 0xe3, 0x83, 0xbe, 0xde, 0x7e, 0x1e, 0xd6, 0xb6, 0x16, 0x76, 0x4b, 0x2b, 0x8b, 0xeb, 0xf1, 0x91, 0x31, 0x51, 0x6c, 0xc, 0xac, 0xcc, 0x98, 0xf8, 0x58, 0x38, 0x5, 0x65, 0xc5, 0xa5, 0xbf, 0xdf, 0x7f, 0x1f, 0x22, 0x42, 0xe2, 0x82, 0x6f, 0xf, 0xaf, 0xcf, 0xf2, 0x92, 0x32, 0x52, 0x48, 0x28, 0x88, 0xe8, 0xd5, 0xb5, 0x15, 0x75, 0x21, 0x41, 0xe1, 0x81, 0xbc, 0xdc, 0x7c, 0x1c, 0x6, 0x66, 0xc6, 0xa6, 0x9b, 0xfb, 0x5b, 0x3b, 0xf3, 0x93, 0x33, 0x53, 0x6e, 0xe, 0xae, 0xce, 0xd4, 0xb4, 0x14, 0x74, 0x49, 0x29, 0x89, 0xe9, 0xbd, 0xdd, 0x7d, 0x1d, 0x20, 0x40, 0xe0, 0x80, 0x9a, 0xfa, 0x5a, 0x3a, 0x7, 0x67, 0xc7, 0xa7}, + {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15, 0x5e, 0x3f, 0x9c, 0xfd, 0xc7, 0xa6, 0x5, 0x64, 0x71, 0x10, 0xb3, 0xd2, 0xe8, 0x89, 0x2a, 0x4b, 0xbc, 0xdd, 0x7e, 0x1f, 0x25, 0x44, 0xe7, 0x86, 0x93, 0xf2, 0x51, 0x30, 0xa, 0x6b, 0xc8, 0xa9, 0xe2, 0x83, 0x20, 0x41, 0x7b, 0x1a, 0xb9, 0xd8, 0xcd, 0xac, 0xf, 0x6e, 0x54, 0x35, 0x96, 0xf7, 0x65, 0x4, 0xa7, 0xc6, 0xfc, 0x9d, 0x3e, 0x5f, 0x4a, 0x2b, 0x88, 0xe9, 0xd3, 0xb2, 0x11, 0x70, 0x3b, 0x5a, 0xf9, 0x98, 0xa2, 0xc3, 0x60, 0x1, 0x14, 0x75, 0xd6, 0xb7, 0x8d, 0xec, 0x4f, 0x2e, 0xd9, 0xb8, 0x1b, 0x7a, 0x40, 0x21, 0x82, 0xe3, 0xf6, 0x97, 0x34, 0x55, 0x6f, 0xe, 0xad, 0xcc, 0x87, 0xe6, 0x45, 0x24, 0x1e, 0x7f, 0xdc, 0xbd, 0xa8, 0xc9, 0x6a, 0xb, 0x31, 0x50, 0xf3, 0x92, 0xca, 0xab, 0x8, 0x69, 0x53, 0x32, 0x91, 0xf0, 0xe5, 0x84, 0x27, 0x46, 0x7c, 0x1d, 0xbe, 0xdf, 0x94, 0xf5, 0x56, 0x37, 0xd, 0x6c, 0xcf, 0xae, 0xbb, 0xda, 0x79, 0x18, 0x22, 0x43, 0xe0, 0x81, 0x76, 0x17, 0xb4, 0xd5, 0xef, 0x8e, 0x2d, 0x4c, 0x59, 0x38, 0x9b, 0xfa, 0xc0, 0xa1, 0x2, 0x63, 0x28, 0x49, 0xea, 0x8b, 0xb1, 0xd0, 0x73, 0x12, 0x7, 0x66, 0xc5, 0xa4, 0x9e, 0xff, 0x5c, 0x3d, 0xaf, 0xce, 0x6d, 0xc, 0x36, 0x57, 0xf4, 0x95, 0x80, 0xe1, 0x42, 0x23, 0x19, 0x78, 0xdb, 0xba, 0xf1, 0x90, 0x33, 0x52, 0x68, 0x9, 0xaa, 0xcb, 0xde, 0xbf, 0x1c, 0x7d, 0x47, 0x26, 0x85, 0xe4, 0x13, 0x72, 0xd1, 0xb0, 0x8a, 0xeb, 0x48, 0x29, 0x3c, 0x5d, 0xfe, 0x9f, 0xa5, 0xc4, 0x67, 0x6, 0x4d, 0x2c, 0x8f, 0xee, 0xd4, 0xb5, 0x16, 0x77, 0x62, 0x3, 0xa0, 0xc1, 0xfb, 0x9a, 0x39, 0x58}, + {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4, 0x6e, 0xc, 0xaa, 0xc8, 0xfb, 0x99, 0x3f, 0x5d, 0x59, 0x3b, 0x9d, 0xff, 0xcc, 0xae, 0x8, 0x6a, 0xdc, 0xbe, 0x18, 0x7a, 0x49, 0x2b, 0x8d, 0xef, 0xeb, 0x89, 0x2f, 0x4d, 0x7e, 0x1c, 0xba, 0xd8, 0xb2, 0xd0, 0x76, 0x14, 0x27, 0x45, 0xe3, 0x81, 0x85, 0xe7, 0x41, 0x23, 0x10, 0x72, 0xd4, 0xb6, 0xa5, 0xc7, 
0x61, 0x3, 0x30, 0x52, 0xf4, 0x96, 0x92, 0xf0, 0x56, 0x34, 0x7, 0x65, 0xc3, 0xa1, 0xcb, 0xa9, 0xf, 0x6d, 0x5e, 0x3c, 0x9a, 0xf8, 0xfc, 0x9e, 0x38, 0x5a, 0x69, 0xb, 0xad, 0xcf, 0x79, 0x1b, 0xbd, 0xdf, 0xec, 0x8e, 0x28, 0x4a, 0x4e, 0x2c, 0x8a, 0xe8, 0xdb, 0xb9, 0x1f, 0x7d, 0x17, 0x75, 0xd3, 0xb1, 0x82, 0xe0, 0x46, 0x24, 0x20, 0x42, 0xe4, 0x86, 0xb5, 0xd7, 0x71, 0x13, 0x57, 0x35, 0x93, 0xf1, 0xc2, 0xa0, 0x6, 0x64, 0x60, 0x2, 0xa4, 0xc6, 0xf5, 0x97, 0x31, 0x53, 0x39, 0x5b, 0xfd, 0x9f, 0xac, 0xce, 0x68, 0xa, 0xe, 0x6c, 0xca, 0xa8, 0x9b, 0xf9, 0x5f, 0x3d, 0x8b, 0xe9, 0x4f, 0x2d, 0x1e, 0x7c, 0xda, 0xb8, 0xbc, 0xde, 0x78, 0x1a, 0x29, 0x4b, 0xed, 0x8f, 0xe5, 0x87, 0x21, 0x43, 0x70, 0x12, 0xb4, 0xd6, 0xd2, 0xb0, 0x16, 0x74, 0x47, 0x25, 0x83, 0xe1, 0xf2, 0x90, 0x36, 0x54, 0x67, 0x5, 0xa3, 0xc1, 0xc5, 0xa7, 0x1, 0x63, 0x50, 0x32, 0x94, 0xf6, 0x9c, 0xfe, 0x58, 0x3a, 0x9, 0x6b, 0xcd, 0xaf, 0xab, 0xc9, 0x6f, 0xd, 0x3e, 0x5c, 0xfa, 0x98, 0x2e, 0x4c, 0xea, 0x88, 0xbb, 0xd9, 0x7f, 0x1d, 0x19, 0x7b, 0xdd, 0xbf, 0x8c, 0xee, 0x48, 0x2a, 0x40, 0x22, 0x84, 0xe6, 0xd5, 0xb7, 0x11, 0x73, 0x77, 0x15, 0xb3, 0xd1, 0xe2, 0x80, 0x26, 0x44}, + {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb, 0x7e, 0x1d, 0xb8, 0xdb, 0xef, 0x8c, 0x29, 0x4a, 0x41, 0x22, 0x87, 0xe4, 0xd0, 0xb3, 0x16, 0x75, 0xfc, 0x9f, 0x3a, 0x59, 0x6d, 0xe, 0xab, 0xc8, 0xc3, 0xa0, 0x5, 0x66, 0x52, 0x31, 0x94, 0xf7, 0x82, 0xe1, 0x44, 0x27, 0x13, 0x70, 0xd5, 0xb6, 0xbd, 0xde, 0x7b, 0x18, 0x2c, 0x4f, 0xea, 0x89, 0xe5, 0x86, 0x23, 0x40, 0x74, 0x17, 0xb2, 0xd1, 0xda, 0xb9, 0x1c, 0x7f, 0x4b, 0x28, 0x8d, 0xee, 0x9b, 0xf8, 0x5d, 0x3e, 0xa, 0x69, 0xcc, 0xaf, 0xa4, 0xc7, 0x62, 0x1, 0x35, 0x56, 0xf3, 0x90, 0x19, 0x7a, 0xdf, 0xbc, 0x88, 0xeb, 0x4e, 0x2d, 0x26, 0x45, 0xe0, 0x83, 0xb7, 0xd4, 0x71, 0x12, 0x67, 0x4, 0xa1, 0xc2, 0xf6, 0x95, 0x30, 0x53, 0x58, 0x3b, 0x9e, 0xfd, 0xc9, 0xaa, 0xf, 0x6c, 0xd7, 0xb4, 0x11, 0x72, 0x46, 0x25, 0x80, 0xe3, 0xe8, 0x8b, 0x2e, 0x4d, 0x79, 0x1a, 0xbf, 0xdc, 0xa9, 0xca, 0x6f, 0xc, 0x38, 0x5b, 0xfe, 0x9d, 0x96, 0xf5, 0x50, 0x33, 0x7, 0x64, 0xc1, 0xa2, 0x2b, 0x48, 0xed, 0x8e, 0xba, 0xd9, 0x7c, 0x1f, 0x14, 0x77, 0xd2, 0xb1, 0x85, 0xe6, 0x43, 0x20, 0x55, 0x36, 0x93, 0xf0, 0xc4, 0xa7, 0x2, 0x61, 0x6a, 0x9, 0xac, 0xcf, 0xfb, 0x98, 0x3d, 0x5e, 0x32, 0x51, 0xf4, 0x97, 0xa3, 0xc0, 0x65, 0x6, 0xd, 0x6e, 0xcb, 0xa8, 0x9c, 0xff, 0x5a, 0x39, 0x4c, 0x2f, 0x8a, 0xe9, 0xdd, 0xbe, 0x1b, 0x78, 0x73, 0x10, 0xb5, 0xd6, 0xe2, 0x81, 0x24, 0x47, 0xce, 0xad, 0x8, 0x6b, 0x5f, 0x3c, 0x99, 0xfa, 0xf1, 0x92, 0x37, 0x54, 0x60, 0x3, 0xa6, 0xc5, 0xb0, 0xd3, 0x76, 0x15, 0x21, 0x42, 0xe7, 0x84, 0x8f, 0xec, 0x49, 0x2a, 0x1e, 0x7d, 0xd8, 0xbb}, + {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26, 0xe, 0x6a, 0xc6, 0xa2, 0x83, 0xe7, 0x4b, 0x2f, 0x9, 0x6d, 0xc1, 0xa5, 0x84, 0xe0, 0x4c, 0x28, 0x1c, 0x78, 0xd4, 0xb0, 0x91, 0xf5, 0x59, 0x3d, 0x1b, 0x7f, 0xd3, 0xb7, 0x96, 0xf2, 0x5e, 0x3a, 0x12, 0x76, 0xda, 0xbe, 0x9f, 0xfb, 0x57, 0x33, 0x15, 0x71, 0xdd, 0xb9, 0x98, 0xfc, 0x50, 0x34, 0x38, 0x5c, 0xf0, 0x94, 0xb5, 0xd1, 0x7d, 0x19, 0x3f, 0x5b, 0xf7, 0x93, 0xb2, 0xd6, 0x7a, 0x1e, 0x36, 0x52, 0xfe, 0x9a, 0xbb, 0xdf, 0x73, 0x17, 0x31, 0x55, 0xf9, 0x9d, 0xbc, 0xd8, 0x74, 0x10, 0x24, 0x40, 0xec, 0x88, 0xa9, 0xcd, 0x61, 0x5, 0x23, 0x47, 0xeb, 0x8f, 0xae, 0xca, 0x66, 0x2, 0x2a, 0x4e, 0xe2, 0x86, 0xa7, 0xc3, 0x6f, 0xb, 0x2d, 0x49, 0xe5, 0x81, 0xa0, 0xc4, 0x68, 0xc, 0x70, 0x14, 0xb8, 0xdc, 0xfd, 0x99, 0x35, 0x51, 0x77, 0x13, 0xbf, 0xdb, 0xfa, 0x9e, 0x32, 0x56, 0x7e, 0x1a, 0xb6, 0xd2, 0xf3, 0x97, 0x3b, 
0x5f, 0x79, 0x1d, 0xb1, 0xd5, 0xf4, 0x90, 0x3c, 0x58, 0x6c, 0x8, 0xa4, 0xc0, 0xe1, 0x85, 0x29, 0x4d, 0x6b, 0xf, 0xa3, 0xc7, 0xe6, 0x82, 0x2e, 0x4a, 0x62, 0x6, 0xaa, 0xce, 0xef, 0x8b, 0x27, 0x43, 0x65, 0x1, 0xad, 0xc9, 0xe8, 0x8c, 0x20, 0x44, 0x48, 0x2c, 0x80, 0xe4, 0xc5, 0xa1, 0xd, 0x69, 0x4f, 0x2b, 0x87, 0xe3, 0xc2, 0xa6, 0xa, 0x6e, 0x46, 0x22, 0x8e, 0xea, 0xcb, 0xaf, 0x3, 0x67, 0x41, 0x25, 0x89, 0xed, 0xcc, 0xa8, 0x4, 0x60, 0x54, 0x30, 0x9c, 0xf8, 0xd9, 0xbd, 0x11, 0x75, 0x53, 0x37, 0x9b, 0xff, 0xde, 0xba, 0x16, 0x72, 0x5a, 0x3e, 0x92, 0xf6, 0xd7, 0xb3, 0x1f, 0x7b, 0x5d, 0x39, 0x95, 0xf1, 0xd0, 0xb4, 0x18, 0x7c}, + {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29, 0x1e, 0x7b, 0xd4, 0xb1, 0x97, 0xf2, 0x5d, 0x38, 0x11, 0x74, 0xdb, 0xbe, 0x98, 0xfd, 0x52, 0x37, 0x3c, 0x59, 0xf6, 0x93, 0xb5, 0xd0, 0x7f, 0x1a, 0x33, 0x56, 0xf9, 0x9c, 0xba, 0xdf, 0x70, 0x15, 0x22, 0x47, 0xe8, 0x8d, 0xab, 0xce, 0x61, 0x4, 0x2d, 0x48, 0xe7, 0x82, 0xa4, 0xc1, 0x6e, 0xb, 0x78, 0x1d, 0xb2, 0xd7, 0xf1, 0x94, 0x3b, 0x5e, 0x77, 0x12, 0xbd, 0xd8, 0xfe, 0x9b, 0x34, 0x51, 0x66, 0x3, 0xac, 0xc9, 0xef, 0x8a, 0x25, 0x40, 0x69, 0xc, 0xa3, 0xc6, 0xe0, 0x85, 0x2a, 0x4f, 0x44, 0x21, 0x8e, 0xeb, 0xcd, 0xa8, 0x7, 0x62, 0x4b, 0x2e, 0x81, 0xe4, 0xc2, 0xa7, 0x8, 0x6d, 0x5a, 0x3f, 0x90, 0xf5, 0xd3, 0xb6, 0x19, 0x7c, 0x55, 0x30, 0x9f, 0xfa, 0xdc, 0xb9, 0x16, 0x73, 0xf0, 0x95, 0x3a, 0x5f, 0x79, 0x1c, 0xb3, 0xd6, 0xff, 0x9a, 0x35, 0x50, 0x76, 0x13, 0xbc, 0xd9, 0xee, 0x8b, 0x24, 0x41, 0x67, 0x2, 0xad, 0xc8, 0xe1, 0x84, 0x2b, 0x4e, 0x68, 0xd, 0xa2, 0xc7, 0xcc, 0xa9, 0x6, 0x63, 0x45, 0x20, 0x8f, 0xea, 0xc3, 0xa6, 0x9, 0x6c, 0x4a, 0x2f, 0x80, 0xe5, 0xd2, 0xb7, 0x18, 0x7d, 0x5b, 0x3e, 0x91, 0xf4, 0xdd, 0xb8, 0x17, 0x72, 0x54, 0x31, 0x9e, 0xfb, 0x88, 0xed, 0x42, 0x27, 0x1, 0x64, 0xcb, 0xae, 0x87, 0xe2, 0x4d, 0x28, 0xe, 0x6b, 0xc4, 0xa1, 0x96, 0xf3, 0x5c, 0x39, 0x1f, 0x7a, 0xd5, 0xb0, 0x99, 0xfc, 0x53, 0x36, 0x10, 0x75, 0xda, 0xbf, 0xb4, 0xd1, 0x7e, 0x1b, 0x3d, 0x58, 0xf7, 0x92, 0xbb, 0xde, 0x71, 0x14, 0x32, 0x57, 0xf8, 0x9d, 0xaa, 0xcf, 0x60, 0x5, 0x23, 0x46, 0xe9, 0x8c, 0xa5, 0xc0, 0x6f, 0xa, 0x2c, 0x49, 0xe6, 0x83}, + {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38, 0x2e, 0x48, 0xe2, 0x84, 0xab, 0xcd, 0x67, 0x1, 0x39, 0x5f, 0xf5, 0x93, 0xbc, 0xda, 0x70, 0x16, 0x5c, 0x3a, 0x90, 0xf6, 0xd9, 0xbf, 0x15, 0x73, 0x4b, 0x2d, 0x87, 0xe1, 0xce, 0xa8, 0x2, 0x64, 0x72, 0x14, 0xbe, 0xd8, 0xf7, 0x91, 0x3b, 0x5d, 0x65, 0x3, 0xa9, 0xcf, 0xe0, 0x86, 0x2c, 0x4a, 0xb8, 0xde, 0x74, 0x12, 0x3d, 0x5b, 0xf1, 0x97, 0xaf, 0xc9, 0x63, 0x5, 0x2a, 0x4c, 0xe6, 0x80, 0x96, 0xf0, 0x5a, 0x3c, 0x13, 0x75, 0xdf, 0xb9, 0x81, 0xe7, 0x4d, 0x2b, 0x4, 0x62, 0xc8, 0xae, 0xe4, 0x82, 0x28, 0x4e, 0x61, 0x7, 0xad, 0xcb, 0xf3, 0x95, 0x3f, 0x59, 0x76, 0x10, 0xba, 0xdc, 0xca, 0xac, 0x6, 0x60, 0x4f, 0x29, 0x83, 0xe5, 0xdd, 0xbb, 0x11, 0x77, 0x58, 0x3e, 0x94, 0xf2, 0x6d, 0xb, 0xa1, 0xc7, 0xe8, 0x8e, 0x24, 0x42, 0x7a, 0x1c, 0xb6, 0xd0, 0xff, 0x99, 0x33, 0x55, 0x43, 0x25, 0x8f, 0xe9, 0xc6, 0xa0, 0xa, 0x6c, 0x54, 0x32, 0x98, 0xfe, 0xd1, 0xb7, 0x1d, 0x7b, 0x31, 0x57, 0xfd, 0x9b, 0xb4, 0xd2, 0x78, 0x1e, 0x26, 0x40, 0xea, 0x8c, 0xa3, 0xc5, 0x6f, 0x9, 0x1f, 0x79, 0xd3, 0xb5, 0x9a, 0xfc, 0x56, 0x30, 0x8, 0x6e, 0xc4, 0xa2, 0x8d, 0xeb, 0x41, 0x27, 0xd5, 0xb3, 0x19, 0x7f, 0x50, 0x36, 0x9c, 0xfa, 0xc2, 0xa4, 0xe, 0x68, 0x47, 0x21, 0x8b, 0xed, 0xfb, 0x9d, 0x37, 0x51, 0x7e, 0x18, 0xb2, 0xd4, 0xec, 0x8a, 0x20, 0x46, 0x69, 0xf, 0xa5, 0xc3, 0x89, 0xef, 0x45, 0x23, 0xc, 0x6a, 0xc0, 0xa6, 0x9e, 0xf8, 0x52, 0x34, 
0x1b, 0x7d, 0xd7, 0xb1, 0xa7, 0xc1, 0x6b, 0xd, 0x22, 0x44, 0xee, 0x88, 0xb0, 0xd6, 0x7c, 0x1a, 0x35, 0x53, 0xf9, 0x9f}, + {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37, 0x3e, 0x59, 0xf0, 0x97, 0xbf, 0xd8, 0x71, 0x16, 0x21, 0x46, 0xef, 0x88, 0xa0, 0xc7, 0x6e, 0x9, 0x7c, 0x1b, 0xb2, 0xd5, 0xfd, 0x9a, 0x33, 0x54, 0x63, 0x4, 0xad, 0xca, 0xe2, 0x85, 0x2c, 0x4b, 0x42, 0x25, 0x8c, 0xeb, 0xc3, 0xa4, 0xd, 0x6a, 0x5d, 0x3a, 0x93, 0xf4, 0xdc, 0xbb, 0x12, 0x75, 0xf8, 0x9f, 0x36, 0x51, 0x79, 0x1e, 0xb7, 0xd0, 0xe7, 0x80, 0x29, 0x4e, 0x66, 0x1, 0xa8, 0xcf, 0xc6, 0xa1, 0x8, 0x6f, 0x47, 0x20, 0x89, 0xee, 0xd9, 0xbe, 0x17, 0x70, 0x58, 0x3f, 0x96, 0xf1, 0x84, 0xe3, 0x4a, 0x2d, 0x5, 0x62, 0xcb, 0xac, 0x9b, 0xfc, 0x55, 0x32, 0x1a, 0x7d, 0xd4, 0xb3, 0xba, 0xdd, 0x74, 0x13, 0x3b, 0x5c, 0xf5, 0x92, 0xa5, 0xc2, 0x6b, 0xc, 0x24, 0x43, 0xea, 0x8d, 0xed, 0x8a, 0x23, 0x44, 0x6c, 0xb, 0xa2, 0xc5, 0xf2, 0x95, 0x3c, 0x5b, 0x73, 0x14, 0xbd, 0xda, 0xd3, 0xb4, 0x1d, 0x7a, 0x52, 0x35, 0x9c, 0xfb, 0xcc, 0xab, 0x2, 0x65, 0x4d, 0x2a, 0x83, 0xe4, 0x91, 0xf6, 0x5f, 0x38, 0x10, 0x77, 0xde, 0xb9, 0x8e, 0xe9, 0x40, 0x27, 0xf, 0x68, 0xc1, 0xa6, 0xaf, 0xc8, 0x61, 0x6, 0x2e, 0x49, 0xe0, 0x87, 0xb0, 0xd7, 0x7e, 0x19, 0x31, 0x56, 0xff, 0x98, 0x15, 0x72, 0xdb, 0xbc, 0x94, 0xf3, 0x5a, 0x3d, 0xa, 0x6d, 0xc4, 0xa3, 0x8b, 0xec, 0x45, 0x22, 0x2b, 0x4c, 0xe5, 0x82, 0xaa, 0xcd, 0x64, 0x3, 0x34, 0x53, 0xfa, 0x9d, 0xb5, 0xd2, 0x7b, 0x1c, 0x69, 0xe, 0xa7, 0xc0, 0xe8, 0x8f, 0x26, 0x41, 0x76, 0x11, 0xb8, 0xdf, 0xf7, 0x90, 0x39, 0x5e, 0x57, 0x30, 0x99, 0xfe, 0xd6, 0xb1, 0x18, 0x7f, 0x48, 0x2f, 0x86, 0xe1, 0xc9, 0xae, 0x7, 0x60}, + {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62, 0xce, 0xa6, 0x1e, 0x76, 0x73, 0x1b, 0xa3, 0xcb, 0xa9, 0xc1, 0x79, 0x11, 0x14, 0x7c, 0xc4, 0xac, 0x81, 0xe9, 0x51, 0x39, 0x3c, 0x54, 0xec, 0x84, 0xe6, 0x8e, 0x36, 0x5e, 0x5b, 0x33, 0x8b, 0xe3, 0x4f, 0x27, 0x9f, 0xf7, 0xf2, 0x9a, 0x22, 0x4a, 0x28, 0x40, 0xf8, 0x90, 0x95, 0xfd, 0x45, 0x2d, 0x1f, 0x77, 0xcf, 0xa7, 0xa2, 0xca, 0x72, 0x1a, 0x78, 0x10, 0xa8, 0xc0, 0xc5, 0xad, 0x15, 0x7d, 0xd1, 0xb9, 0x1, 0x69, 0x6c, 0x4, 0xbc, 0xd4, 0xb6, 0xde, 0x66, 0xe, 0xb, 0x63, 0xdb, 0xb3, 0x9e, 0xf6, 0x4e, 0x26, 0x23, 0x4b, 0xf3, 0x9b, 0xf9, 0x91, 0x29, 0x41, 0x44, 0x2c, 0x94, 0xfc, 0x50, 0x38, 0x80, 0xe8, 0xed, 0x85, 0x3d, 0x55, 0x37, 0x5f, 0xe7, 0x8f, 0x8a, 0xe2, 0x5a, 0x32, 0x3e, 0x56, 0xee, 0x86, 0x83, 0xeb, 0x53, 0x3b, 0x59, 0x31, 0x89, 0xe1, 0xe4, 0x8c, 0x34, 0x5c, 0xf0, 0x98, 0x20, 0x48, 0x4d, 0x25, 0x9d, 0xf5, 0x97, 0xff, 0x47, 0x2f, 0x2a, 0x42, 0xfa, 0x92, 0xbf, 0xd7, 0x6f, 0x7, 0x2, 0x6a, 0xd2, 0xba, 0xd8, 0xb0, 0x8, 0x60, 0x65, 0xd, 0xb5, 0xdd, 0x71, 0x19, 0xa1, 0xc9, 0xcc, 0xa4, 0x1c, 0x74, 0x16, 0x7e, 0xc6, 0xae, 0xab, 0xc3, 0x7b, 0x13, 0x21, 0x49, 0xf1, 0x99, 0x9c, 0xf4, 0x4c, 0x24, 0x46, 0x2e, 0x96, 0xfe, 0xfb, 0x93, 0x2b, 0x43, 0xef, 0x87, 0x3f, 0x57, 0x52, 0x3a, 0x82, 0xea, 0x88, 0xe0, 0x58, 0x30, 0x35, 0x5d, 0xe5, 0x8d, 0xa0, 0xc8, 0x70, 0x18, 0x1d, 0x75, 0xcd, 0xa5, 0xc7, 0xaf, 0x17, 0x7f, 0x7a, 0x12, 0xaa, 0xc2, 0x6e, 0x6, 0xbe, 0xd6, 0xd3, 0xbb, 0x3, 0x6b, 0x9, 0x61, 0xd9, 0xb1, 0xb4, 0xdc, 0x64, 0xc}, + {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d, 0xde, 0xb7, 0xc, 0x65, 0x67, 0xe, 0xb5, 0xdc, 0xb1, 0xd8, 0x63, 0xa, 0x8, 0x61, 0xda, 0xb3, 0xa1, 0xc8, 0x73, 0x1a, 0x18, 0x71, 0xca, 0xa3, 0xce, 0xa7, 0x1c, 0x75, 0x77, 0x1e, 0xa5, 0xcc, 0x7f, 0x16, 0xad, 0xc4, 0xc6, 0xaf, 0x14, 0x7d, 0x10, 0x79, 0xc2, 0xab, 0xa9, 0xc0, 0x7b, 0x12, 0x5f, 
0x36, 0x8d, 0xe4, 0xe6, 0x8f, 0x34, 0x5d, 0x30, 0x59, 0xe2, 0x8b, 0x89, 0xe0, 0x5b, 0x32, 0x81, 0xe8, 0x53, 0x3a, 0x38, 0x51, 0xea, 0x83, 0xee, 0x87, 0x3c, 0x55, 0x57, 0x3e, 0x85, 0xec, 0xfe, 0x97, 0x2c, 0x45, 0x47, 0x2e, 0x95, 0xfc, 0x91, 0xf8, 0x43, 0x2a, 0x28, 0x41, 0xfa, 0x93, 0x20, 0x49, 0xf2, 0x9b, 0x99, 0xf0, 0x4b, 0x22, 0x4f, 0x26, 0x9d, 0xf4, 0xf6, 0x9f, 0x24, 0x4d, 0xbe, 0xd7, 0x6c, 0x5, 0x7, 0x6e, 0xd5, 0xbc, 0xd1, 0xb8, 0x3, 0x6a, 0x68, 0x1, 0xba, 0xd3, 0x60, 0x9, 0xb2, 0xdb, 0xd9, 0xb0, 0xb, 0x62, 0xf, 0x66, 0xdd, 0xb4, 0xb6, 0xdf, 0x64, 0xd, 0x1f, 0x76, 0xcd, 0xa4, 0xa6, 0xcf, 0x74, 0x1d, 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, 0xc1, 0xa8, 0x13, 0x7a, 0x78, 0x11, 0xaa, 0xc3, 0xae, 0xc7, 0x7c, 0x15, 0x17, 0x7e, 0xc5, 0xac, 0xe1, 0x88, 0x33, 0x5a, 0x58, 0x31, 0x8a, 0xe3, 0x8e, 0xe7, 0x5c, 0x35, 0x37, 0x5e, 0xe5, 0x8c, 0x3f, 0x56, 0xed, 0x84, 0x86, 0xef, 0x54, 0x3d, 0x50, 0x39, 0x82, 0xeb, 0xe9, 0x80, 0x3b, 0x52, 0x40, 0x29, 0x92, 0xfb, 0xf9, 0x90, 0x2b, 0x42, 0x2f, 0x46, 0xfd, 0x94, 0x96, 0xff, 0x44, 0x2d, 0x9e, 0xf7, 0x4c, 0x25, 0x27, 0x4e, 0xf5, 0x9c, 0xf1, 0x98, 0x23, 0x4a, 0x48, 0x21, 0x9a, 0xf3}, + {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c, 0xee, 0x84, 0x3a, 0x50, 0x5b, 0x31, 0x8f, 0xe5, 0x99, 0xf3, 0x4d, 0x27, 0x2c, 0x46, 0xf8, 0x92, 0xc1, 0xab, 0x15, 0x7f, 0x74, 0x1e, 0xa0, 0xca, 0xb6, 0xdc, 0x62, 0x8, 0x3, 0x69, 0xd7, 0xbd, 0x2f, 0x45, 0xfb, 0x91, 0x9a, 0xf0, 0x4e, 0x24, 0x58, 0x32, 0x8c, 0xe6, 0xed, 0x87, 0x39, 0x53, 0x9f, 0xf5, 0x4b, 0x21, 0x2a, 0x40, 0xfe, 0x94, 0xe8, 0x82, 0x3c, 0x56, 0x5d, 0x37, 0x89, 0xe3, 0x71, 0x1b, 0xa5, 0xcf, 0xc4, 0xae, 0x10, 0x7a, 0x6, 0x6c, 0xd2, 0xb8, 0xb3, 0xd9, 0x67, 0xd, 0x5e, 0x34, 0x8a, 0xe0, 0xeb, 0x81, 0x3f, 0x55, 0x29, 0x43, 0xfd, 0x97, 0x9c, 0xf6, 0x48, 0x22, 0xb0, 0xda, 0x64, 0xe, 0x5, 0x6f, 0xd1, 0xbb, 0xc7, 0xad, 0x13, 0x79, 0x72, 0x18, 0xa6, 0xcc, 0x23, 0x49, 0xf7, 0x9d, 0x96, 0xfc, 0x42, 0x28, 0x54, 0x3e, 0x80, 0xea, 0xe1, 0x8b, 0x35, 0x5f, 0xcd, 0xa7, 0x19, 0x73, 0x78, 0x12, 0xac, 0xc6, 0xba, 0xd0, 0x6e, 0x4, 0xf, 0x65, 0xdb, 0xb1, 0xe2, 0x88, 0x36, 0x5c, 0x57, 0x3d, 0x83, 0xe9, 0x95, 0xff, 0x41, 0x2b, 0x20, 0x4a, 0xf4, 0x9e, 0xc, 0x66, 0xd8, 0xb2, 0xb9, 0xd3, 0x6d, 0x7, 0x7b, 0x11, 0xaf, 0xc5, 0xce, 0xa4, 0x1a, 0x70, 0xbc, 0xd6, 0x68, 0x2, 0x9, 0x63, 0xdd, 0xb7, 0xcb, 0xa1, 0x1f, 0x75, 0x7e, 0x14, 0xaa, 0xc0, 0x52, 0x38, 0x86, 0xec, 0xe7, 0x8d, 0x33, 0x59, 0x25, 0x4f, 0xf1, 0x9b, 0x90, 0xfa, 0x44, 0x2e, 0x7d, 0x17, 0xa9, 0xc3, 0xc8, 0xa2, 0x1c, 0x76, 0xa, 0x60, 0xde, 0xb4, 0xbf, 0xd5, 0x6b, 0x1, 0x93, 0xf9, 0x47, 0x2d, 0x26, 0x4c, 0xf2, 0x98, 0xe4, 0x8e, 0x30, 0x5a, 0x51, 0x3b, 0x85, 0xef}, + {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73, 0xfe, 0x95, 0x28, 0x43, 0x4f, 0x24, 0x99, 0xf2, 0x81, 0xea, 0x57, 0x3c, 0x30, 0x5b, 0xe6, 0x8d, 0xe1, 0x8a, 0x37, 0x5c, 0x50, 0x3b, 0x86, 0xed, 0x9e, 0xf5, 0x48, 0x23, 0x2f, 0x44, 0xf9, 0x92, 0x1f, 0x74, 0xc9, 0xa2, 0xae, 0xc5, 0x78, 0x13, 0x60, 0xb, 0xb6, 0xdd, 0xd1, 0xba, 0x7, 0x6c, 0xdf, 0xb4, 0x9, 0x62, 0x6e, 0x5, 0xb8, 0xd3, 0xa0, 0xcb, 0x76, 0x1d, 0x11, 0x7a, 0xc7, 0xac, 0x21, 0x4a, 0xf7, 0x9c, 0x90, 0xfb, 0x46, 0x2d, 0x5e, 0x35, 0x88, 0xe3, 0xef, 0x84, 0x39, 0x52, 0x3e, 0x55, 0xe8, 0x83, 0x8f, 0xe4, 0x59, 0x32, 0x41, 0x2a, 0x97, 0xfc, 0xf0, 0x9b, 0x26, 0x4d, 0xc0, 0xab, 0x16, 0x7d, 0x71, 0x1a, 0xa7, 0xcc, 0xbf, 0xd4, 0x69, 0x2, 0xe, 0x65, 0xd8, 0xb3, 0xa3, 0xc8, 0x75, 0x1e, 0x12, 0x79, 0xc4, 0xaf, 0xdc, 0xb7, 0xa, 0x61, 0x6d, 0x6, 0xbb, 0xd0, 0x5d, 0x36, 0x8b, 0xe0, 0xec, 
0x87, 0x3a, 0x51, 0x22, 0x49, 0xf4, 0x9f, 0x93, 0xf8, 0x45, 0x2e, 0x42, 0x29, 0x94, 0xff, 0xf3, 0x98, 0x25, 0x4e, 0x3d, 0x56, 0xeb, 0x80, 0x8c, 0xe7, 0x5a, 0x31, 0xbc, 0xd7, 0x6a, 0x1, 0xd, 0x66, 0xdb, 0xb0, 0xc3, 0xa8, 0x15, 0x7e, 0x72, 0x19, 0xa4, 0xcf, 0x7c, 0x17, 0xaa, 0xc1, 0xcd, 0xa6, 0x1b, 0x70, 0x3, 0x68, 0xd5, 0xbe, 0xb2, 0xd9, 0x64, 0xf, 0x82, 0xe9, 0x54, 0x3f, 0x33, 0x58, 0xe5, 0x8e, 0xfd, 0x96, 0x2b, 0x40, 0x4c, 0x27, 0x9a, 0xf1, 0x9d, 0xf6, 0x4b, 0x20, 0x2c, 0x47, 0xfa, 0x91, 0xe2, 0x89, 0x34, 0x5f, 0x53, 0x38, 0x85, 0xee, 0x63, 0x8, 0xb5, 0xde, 0xd2, 0xb9, 0x4, 0x6f, 0x1c, 0x77, 0xca, 0xa1, 0xad, 0xc6, 0x7b, 0x10}, + {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e, 0x8e, 0xe2, 0x56, 0x3a, 0x23, 0x4f, 0xfb, 0x97, 0xc9, 0xa5, 0x11, 0x7d, 0x64, 0x8, 0xbc, 0xd0, 0x1, 0x6d, 0xd9, 0xb5, 0xac, 0xc0, 0x74, 0x18, 0x46, 0x2a, 0x9e, 0xf2, 0xeb, 0x87, 0x33, 0x5f, 0x8f, 0xe3, 0x57, 0x3b, 0x22, 0x4e, 0xfa, 0x96, 0xc8, 0xa4, 0x10, 0x7c, 0x65, 0x9, 0xbd, 0xd1, 0x2, 0x6e, 0xda, 0xb6, 0xaf, 0xc3, 0x77, 0x1b, 0x45, 0x29, 0x9d, 0xf1, 0xe8, 0x84, 0x30, 0x5c, 0x8c, 0xe0, 0x54, 0x38, 0x21, 0x4d, 0xf9, 0x95, 0xcb, 0xa7, 0x13, 0x7f, 0x66, 0xa, 0xbe, 0xd2, 0x3, 0x6f, 0xdb, 0xb7, 0xae, 0xc2, 0x76, 0x1a, 0x44, 0x28, 0x9c, 0xf0, 0xe9, 0x85, 0x31, 0x5d, 0x8d, 0xe1, 0x55, 0x39, 0x20, 0x4c, 0xf8, 0x94, 0xca, 0xa6, 0x12, 0x7e, 0x67, 0xb, 0xbf, 0xd3, 0x4, 0x68, 0xdc, 0xb0, 0xa9, 0xc5, 0x71, 0x1d, 0x43, 0x2f, 0x9b, 0xf7, 0xee, 0x82, 0x36, 0x5a, 0x8a, 0xe6, 0x52, 0x3e, 0x27, 0x4b, 0xff, 0x93, 0xcd, 0xa1, 0x15, 0x79, 0x60, 0xc, 0xb8, 0xd4, 0x5, 0x69, 0xdd, 0xb1, 0xa8, 0xc4, 0x70, 0x1c, 0x42, 0x2e, 0x9a, 0xf6, 0xef, 0x83, 0x37, 0x5b, 0x8b, 0xe7, 0x53, 0x3f, 0x26, 0x4a, 0xfe, 0x92, 0xcc, 0xa0, 0x14, 0x78, 0x61, 0xd, 0xb9, 0xd5, 0x6, 0x6a, 0xde, 0xb2, 0xab, 0xc7, 0x73, 0x1f, 0x41, 0x2d, 0x99, 0xf5, 0xec, 0x80, 0x34, 0x58, 0x88, 0xe4, 0x50, 0x3c, 0x25, 0x49, 0xfd, 0x91, 0xcf, 0xa3, 0x17, 0x7b, 0x62, 0xe, 0xba, 0xd6, 0x7, 0x6b, 0xdf, 0xb3, 0xaa, 0xc6, 0x72, 0x1e, 0x40, 0x2c, 0x98, 0xf4, 0xed, 0x81, 0x35, 0x59, 0x89, 0xe5, 0x51, 0x3d, 0x24, 0x48, 0xfc, 0x90, 0xce, 0xa2, 0x16, 0x7a, 0x63, 0xf, 0xbb, 0xd7}, + {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51, 0x9e, 0xf3, 0x44, 0x29, 0x37, 0x5a, 0xed, 0x80, 0xd1, 0xbc, 0xb, 0x66, 0x78, 0x15, 0xa2, 0xcf, 0x21, 0x4c, 0xfb, 0x96, 0x88, 0xe5, 0x52, 0x3f, 0x6e, 0x3, 0xb4, 0xd9, 0xc7, 0xaa, 0x1d, 0x70, 0xbf, 0xd2, 0x65, 0x8, 0x16, 0x7b, 0xcc, 0xa1, 0xf0, 0x9d, 0x2a, 0x47, 0x59, 0x34, 0x83, 0xee, 0x42, 0x2f, 0x98, 0xf5, 0xeb, 0x86, 0x31, 0x5c, 0xd, 0x60, 0xd7, 0xba, 0xa4, 0xc9, 0x7e, 0x13, 0xdc, 0xb1, 0x6, 0x6b, 0x75, 0x18, 0xaf, 0xc2, 0x93, 0xfe, 0x49, 0x24, 0x3a, 0x57, 0xe0, 0x8d, 0x63, 0xe, 0xb9, 0xd4, 0xca, 0xa7, 0x10, 0x7d, 0x2c, 0x41, 0xf6, 0x9b, 0x85, 0xe8, 0x5f, 0x32, 0xfd, 0x90, 0x27, 0x4a, 0x54, 0x39, 0x8e, 0xe3, 0xb2, 0xdf, 0x68, 0x5, 0x1b, 0x76, 0xc1, 0xac, 0x84, 0xe9, 0x5e, 0x33, 0x2d, 0x40, 0xf7, 0x9a, 0xcb, 0xa6, 0x11, 0x7c, 0x62, 0xf, 0xb8, 0xd5, 0x1a, 0x77, 0xc0, 0xad, 0xb3, 0xde, 0x69, 0x4, 0x55, 0x38, 0x8f, 0xe2, 0xfc, 0x91, 0x26, 0x4b, 0xa5, 0xc8, 0x7f, 0x12, 0xc, 0x61, 0xd6, 0xbb, 0xea, 0x87, 0x30, 0x5d, 0x43, 0x2e, 0x99, 0xf4, 0x3b, 0x56, 0xe1, 0x8c, 0x92, 0xff, 0x48, 0x25, 0x74, 0x19, 0xae, 0xc3, 0xdd, 0xb0, 0x7, 0x6a, 0xc6, 0xab, 0x1c, 0x71, 0x6f, 0x2, 0xb5, 0xd8, 0x89, 0xe4, 0x53, 0x3e, 0x20, 0x4d, 0xfa, 0x97, 0x58, 0x35, 0x82, 0xef, 0xf1, 0x9c, 0x2b, 0x46, 0x17, 0x7a, 0xcd, 0xa0, 0xbe, 0xd3, 0x64, 0x9, 0xe7, 0x8a, 0x3d, 0x50, 0x4e, 0x23, 0x94, 0xf9, 0xa8, 0xc5, 
0x72, 0x1f, 0x1, 0x6c, 0xdb, 0xb6, 0x79, 0x14, 0xa3, 0xce, 0xd0, 0xbd, 0xa, 0x67, 0x36, 0x5b, 0xec, 0x81, 0x9f, 0xf2, 0x45, 0x28}, + {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40, 0xae, 0xc0, 0x72, 0x1c, 0xb, 0x65, 0xd7, 0xb9, 0xf9, 0x97, 0x25, 0x4b, 0x5c, 0x32, 0x80, 0xee, 0x41, 0x2f, 0x9d, 0xf3, 0xe4, 0x8a, 0x38, 0x56, 0x16, 0x78, 0xca, 0xa4, 0xb3, 0xdd, 0x6f, 0x1, 0xef, 0x81, 0x33, 0x5d, 0x4a, 0x24, 0x96, 0xf8, 0xb8, 0xd6, 0x64, 0xa, 0x1d, 0x73, 0xc1, 0xaf, 0x82, 0xec, 0x5e, 0x30, 0x27, 0x49, 0xfb, 0x95, 0xd5, 0xbb, 0x9, 0x67, 0x70, 0x1e, 0xac, 0xc2, 0x2c, 0x42, 0xf0, 0x9e, 0x89, 0xe7, 0x55, 0x3b, 0x7b, 0x15, 0xa7, 0xc9, 0xde, 0xb0, 0x2, 0x6c, 0xc3, 0xad, 0x1f, 0x71, 0x66, 0x8, 0xba, 0xd4, 0x94, 0xfa, 0x48, 0x26, 0x31, 0x5f, 0xed, 0x83, 0x6d, 0x3, 0xb1, 0xdf, 0xc8, 0xa6, 0x14, 0x7a, 0x3a, 0x54, 0xe6, 0x88, 0x9f, 0xf1, 0x43, 0x2d, 0x19, 0x77, 0xc5, 0xab, 0xbc, 0xd2, 0x60, 0xe, 0x4e, 0x20, 0x92, 0xfc, 0xeb, 0x85, 0x37, 0x59, 0xb7, 0xd9, 0x6b, 0x5, 0x12, 0x7c, 0xce, 0xa0, 0xe0, 0x8e, 0x3c, 0x52, 0x45, 0x2b, 0x99, 0xf7, 0x58, 0x36, 0x84, 0xea, 0xfd, 0x93, 0x21, 0x4f, 0xf, 0x61, 0xd3, 0xbd, 0xaa, 0xc4, 0x76, 0x18, 0xf6, 0x98, 0x2a, 0x44, 0x53, 0x3d, 0x8f, 0xe1, 0xa1, 0xcf, 0x7d, 0x13, 0x4, 0x6a, 0xd8, 0xb6, 0x9b, 0xf5, 0x47, 0x29, 0x3e, 0x50, 0xe2, 0x8c, 0xcc, 0xa2, 0x10, 0x7e, 0x69, 0x7, 0xb5, 0xdb, 0x35, 0x5b, 0xe9, 0x87, 0x90, 0xfe, 0x4c, 0x22, 0x62, 0xc, 0xbe, 0xd0, 0xc7, 0xa9, 0x1b, 0x75, 0xda, 0xb4, 0x6, 0x68, 0x7f, 0x11, 0xa3, 0xcd, 0x8d, 0xe3, 0x51, 0x3f, 0x28, 0x46, 0xf4, 0x9a, 0x74, 0x1a, 0xa8, 0xc6, 0xd1, 0xbf, 0xd, 0x63, 0x23, 0x4d, 0xff, 0x91, 0x86, 0xe8, 0x5a, 0x34}, + {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f, 0xbe, 0xd1, 0x60, 0xf, 0x1f, 0x70, 0xc1, 0xae, 0xe1, 0x8e, 0x3f, 0x50, 0x40, 0x2f, 0x9e, 0xf1, 0x61, 0xe, 0xbf, 0xd0, 0xc0, 0xaf, 0x1e, 0x71, 0x3e, 0x51, 0xe0, 0x8f, 0x9f, 0xf0, 0x41, 0x2e, 0xdf, 0xb0, 0x1, 0x6e, 0x7e, 0x11, 0xa0, 0xcf, 0x80, 0xef, 0x5e, 0x31, 0x21, 0x4e, 0xff, 0x90, 0xc2, 0xad, 0x1c, 0x73, 0x63, 0xc, 0xbd, 0xd2, 0x9d, 0xf2, 0x43, 0x2c, 0x3c, 0x53, 0xe2, 0x8d, 0x7c, 0x13, 0xa2, 0xcd, 0xdd, 0xb2, 0x3, 0x6c, 0x23, 0x4c, 0xfd, 0x92, 0x82, 0xed, 0x5c, 0x33, 0xa3, 0xcc, 0x7d, 0x12, 0x2, 0x6d, 0xdc, 0xb3, 0xfc, 0x93, 0x22, 0x4d, 0x5d, 0x32, 0x83, 0xec, 0x1d, 0x72, 0xc3, 0xac, 0xbc, 0xd3, 0x62, 0xd, 0x42, 0x2d, 0x9c, 0xf3, 0xe3, 0x8c, 0x3d, 0x52, 0x99, 0xf6, 0x47, 0x28, 0x38, 0x57, 0xe6, 0x89, 0xc6, 0xa9, 0x18, 0x77, 0x67, 0x8, 0xb9, 0xd6, 0x27, 0x48, 0xf9, 0x96, 0x86, 0xe9, 0x58, 0x37, 0x78, 0x17, 0xa6, 0xc9, 0xd9, 0xb6, 0x7, 0x68, 0xf8, 0x97, 0x26, 0x49, 0x59, 0x36, 0x87, 0xe8, 0xa7, 0xc8, 0x79, 0x16, 0x6, 0x69, 0xd8, 0xb7, 0x46, 0x29, 0x98, 0xf7, 0xe7, 0x88, 0x39, 0x56, 0x19, 0x76, 0xc7, 0xa8, 0xb8, 0xd7, 0x66, 0x9, 0x5b, 0x34, 0x85, 0xea, 0xfa, 0x95, 0x24, 0x4b, 0x4, 0x6b, 0xda, 0xb5, 0xa5, 0xca, 0x7b, 0x14, 0xe5, 0x8a, 0x3b, 0x54, 0x44, 0x2b, 0x9a, 0xf5, 0xba, 0xd5, 0x64, 0xb, 0x1b, 0x74, 0xc5, 0xaa, 0x3a, 0x55, 0xe4, 0x8b, 0x9b, 0xf4, 0x45, 0x2a, 0x65, 0xa, 0xbb, 0xd4, 0xc4, 0xab, 0x1a, 0x75, 0x84, 0xeb, 0x5a, 0x35, 0x25, 0x4a, 0xfb, 0x94, 0xdb, 0xb4, 0x5, 0x6a, 0x7a, 0x15, 0xa4, 0xcb}, + {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea, 0x53, 0x23, 0xb3, 0xc3, 0x8e, 0xfe, 0x6e, 0x1e, 0xf4, 0x84, 0x14, 0x64, 0x29, 0x59, 0xc9, 0xb9, 0xa6, 0xd6, 0x46, 0x36, 0x7b, 0xb, 0x9b, 0xeb, 0x1, 0x71, 0xe1, 0x91, 0xdc, 0xac, 0x3c, 0x4c, 0xf5, 0x85, 0x15, 0x65, 0x28, 0x58, 0xc8, 0xb8, 0x52, 0x22, 0xb2, 0xc2, 0x8f, 0xff, 
0x6f, 0x1f, 0x51, 0x21, 0xb1, 0xc1, 0x8c, 0xfc, 0x6c, 0x1c, 0xf6, 0x86, 0x16, 0x66, 0x2b, 0x5b, 0xcb, 0xbb, 0x2, 0x72, 0xe2, 0x92, 0xdf, 0xaf, 0x3f, 0x4f, 0xa5, 0xd5, 0x45, 0x35, 0x78, 0x8, 0x98, 0xe8, 0xf7, 0x87, 0x17, 0x67, 0x2a, 0x5a, 0xca, 0xba, 0x50, 0x20, 0xb0, 0xc0, 0x8d, 0xfd, 0x6d, 0x1d, 0xa4, 0xd4, 0x44, 0x34, 0x79, 0x9, 0x99, 0xe9, 0x3, 0x73, 0xe3, 0x93, 0xde, 0xae, 0x3e, 0x4e, 0xa2, 0xd2, 0x42, 0x32, 0x7f, 0xf, 0x9f, 0xef, 0x5, 0x75, 0xe5, 0x95, 0xd8, 0xa8, 0x38, 0x48, 0xf1, 0x81, 0x11, 0x61, 0x2c, 0x5c, 0xcc, 0xbc, 0x56, 0x26, 0xb6, 0xc6, 0x8b, 0xfb, 0x6b, 0x1b, 0x4, 0x74, 0xe4, 0x94, 0xd9, 0xa9, 0x39, 0x49, 0xa3, 0xd3, 0x43, 0x33, 0x7e, 0xe, 0x9e, 0xee, 0x57, 0x27, 0xb7, 0xc7, 0x8a, 0xfa, 0x6a, 0x1a, 0xf0, 0x80, 0x10, 0x60, 0x2d, 0x5d, 0xcd, 0xbd, 0xf3, 0x83, 0x13, 0x63, 0x2e, 0x5e, 0xce, 0xbe, 0x54, 0x24, 0xb4, 0xc4, 0x89, 0xf9, 0x69, 0x19, 0xa0, 0xd0, 0x40, 0x30, 0x7d, 0xd, 0x9d, 0xed, 0x7, 0x77, 0xe7, 0x97, 0xda, 0xaa, 0x3a, 0x4a, 0x55, 0x25, 0xb5, 0xc5, 0x88, 0xf8, 0x68, 0x18, 0xf2, 0x82, 0x12, 0x62, 0x2f, 0x5f, 0xcf, 0xbf, 0x6, 0x76, 0xe6, 0x96, 0xdb, 0xab, 0x3b, 0x4b, 0xa1, 0xd1, 0x41, 0x31, 0x7c, 0xc, 0x9c, 0xec}, + {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5, 0x43, 0x32, 0xa1, 0xd0, 0x9a, 0xeb, 0x78, 0x9, 0xec, 0x9d, 0xe, 0x7f, 0x35, 0x44, 0xd7, 0xa6, 0x86, 0xf7, 0x64, 0x15, 0x5f, 0x2e, 0xbd, 0xcc, 0x29, 0x58, 0xcb, 0xba, 0xf0, 0x81, 0x12, 0x63, 0xc5, 0xb4, 0x27, 0x56, 0x1c, 0x6d, 0xfe, 0x8f, 0x6a, 0x1b, 0x88, 0xf9, 0xb3, 0xc2, 0x51, 0x20, 0x11, 0x60, 0xf3, 0x82, 0xc8, 0xb9, 0x2a, 0x5b, 0xbe, 0xcf, 0x5c, 0x2d, 0x67, 0x16, 0x85, 0xf4, 0x52, 0x23, 0xb0, 0xc1, 0x8b, 0xfa, 0x69, 0x18, 0xfd, 0x8c, 0x1f, 0x6e, 0x24, 0x55, 0xc6, 0xb7, 0x97, 0xe6, 0x75, 0x4, 0x4e, 0x3f, 0xac, 0xdd, 0x38, 0x49, 0xda, 0xab, 0xe1, 0x90, 0x3, 0x72, 0xd4, 0xa5, 0x36, 0x47, 0xd, 0x7c, 0xef, 0x9e, 0x7b, 0xa, 0x99, 0xe8, 0xa2, 0xd3, 0x40, 0x31, 0x22, 0x53, 0xc0, 0xb1, 0xfb, 0x8a, 0x19, 0x68, 0x8d, 0xfc, 0x6f, 0x1e, 0x54, 0x25, 0xb6, 0xc7, 0x61, 0x10, 0x83, 0xf2, 0xb8, 0xc9, 0x5a, 0x2b, 0xce, 0xbf, 0x2c, 0x5d, 0x17, 0x66, 0xf5, 0x84, 0xa4, 0xd5, 0x46, 0x37, 0x7d, 0xc, 0x9f, 0xee, 0xb, 0x7a, 0xe9, 0x98, 0xd2, 0xa3, 0x30, 0x41, 0xe7, 0x96, 0x5, 0x74, 0x3e, 0x4f, 0xdc, 0xad, 0x48, 0x39, 0xaa, 0xdb, 0x91, 0xe0, 0x73, 0x2, 0x33, 0x42, 0xd1, 0xa0, 0xea, 0x9b, 0x8, 0x79, 0x9c, 0xed, 0x7e, 0xf, 0x45, 0x34, 0xa7, 0xd6, 0x70, 0x1, 0x92, 0xe3, 0xa9, 0xd8, 0x4b, 0x3a, 0xdf, 0xae, 0x3d, 0x4c, 0x6, 0x77, 0xe4, 0x95, 0xb5, 0xc4, 0x57, 0x26, 0x6c, 0x1d, 0x8e, 0xff, 0x1a, 0x6b, 0xf8, 0x89, 0xc3, 0xb2, 0x21, 0x50, 0xf6, 0x87, 0x14, 0x65, 0x2f, 0x5e, 0xcd, 0xbc, 0x59, 0x28, 0xbb, 0xca, 0x80, 0xf1, 0x62, 0x13}, + {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4, 0x73, 0x1, 0x97, 0xe5, 0xa6, 0xd4, 0x42, 0x30, 0xc4, 0xb6, 0x20, 0x52, 0x11, 0x63, 0xf5, 0x87, 0xe6, 0x94, 0x2, 0x70, 0x33, 0x41, 0xd7, 0xa5, 0x51, 0x23, 0xb5, 0xc7, 0x84, 0xf6, 0x60, 0x12, 0x95, 0xe7, 0x71, 0x3, 0x40, 0x32, 0xa4, 0xd6, 0x22, 0x50, 0xc6, 0xb4, 0xf7, 0x85, 0x13, 0x61, 0xd1, 0xa3, 0x35, 0x47, 0x4, 0x76, 0xe0, 0x92, 0x66, 0x14, 0x82, 0xf0, 0xb3, 0xc1, 0x57, 0x25, 0xa2, 0xd0, 0x46, 0x34, 0x77, 0x5, 0x93, 0xe1, 0x15, 0x67, 0xf1, 0x83, 0xc0, 0xb2, 0x24, 0x56, 0x37, 0x45, 0xd3, 0xa1, 0xe2, 0x90, 0x6, 0x74, 0x80, 0xf2, 0x64, 0x16, 0x55, 0x27, 0xb1, 0xc3, 0x44, 0x36, 0xa0, 0xd2, 0x91, 0xe3, 0x75, 0x7, 0xf3, 0x81, 0x17, 0x65, 0x26, 0x54, 0xc2, 0xb0, 0xbf, 0xcd, 0x5b, 0x29, 0x6a, 0x18, 0x8e, 0xfc, 0x8, 0x7a, 0xec, 0x9e, 0xdd, 0xaf, 0x39, 0x4b, 0xcc, 0xbe, 0x28, 
0x5a, 0x19, 0x6b, 0xfd, 0x8f, 0x7b, 0x9, 0x9f, 0xed, 0xae, 0xdc, 0x4a, 0x38, 0x59, 0x2b, 0xbd, 0xcf, 0x8c, 0xfe, 0x68, 0x1a, 0xee, 0x9c, 0xa, 0x78, 0x3b, 0x49, 0xdf, 0xad, 0x2a, 0x58, 0xce, 0xbc, 0xff, 0x8d, 0x1b, 0x69, 0x9d, 0xef, 0x79, 0xb, 0x48, 0x3a, 0xac, 0xde, 0x6e, 0x1c, 0x8a, 0xf8, 0xbb, 0xc9, 0x5f, 0x2d, 0xd9, 0xab, 0x3d, 0x4f, 0xc, 0x7e, 0xe8, 0x9a, 0x1d, 0x6f, 0xf9, 0x8b, 0xc8, 0xba, 0x2c, 0x5e, 0xaa, 0xd8, 0x4e, 0x3c, 0x7f, 0xd, 0x9b, 0xe9, 0x88, 0xfa, 0x6c, 0x1e, 0x5d, 0x2f, 0xb9, 0xcb, 0x3f, 0x4d, 0xdb, 0xa9, 0xea, 0x98, 0xe, 0x7c, 0xfb, 0x89, 0x1f, 0x6d, 0x2e, 0x5c, 0xca, 0xb8, 0x4c, 0x3e, 0xa8, 0xda, 0x99, 0xeb, 0x7d, 0xf}, + {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb, 0x63, 0x10, 0x85, 0xf6, 0xb2, 0xc1, 0x54, 0x27, 0xdc, 0xaf, 0x3a, 0x49, 0xd, 0x7e, 0xeb, 0x98, 0xc6, 0xb5, 0x20, 0x53, 0x17, 0x64, 0xf1, 0x82, 0x79, 0xa, 0x9f, 0xec, 0xa8, 0xdb, 0x4e, 0x3d, 0xa5, 0xd6, 0x43, 0x30, 0x74, 0x7, 0x92, 0xe1, 0x1a, 0x69, 0xfc, 0x8f, 0xcb, 0xb8, 0x2d, 0x5e, 0x91, 0xe2, 0x77, 0x4, 0x40, 0x33, 0xa6, 0xd5, 0x2e, 0x5d, 0xc8, 0xbb, 0xff, 0x8c, 0x19, 0x6a, 0xf2, 0x81, 0x14, 0x67, 0x23, 0x50, 0xc5, 0xb6, 0x4d, 0x3e, 0xab, 0xd8, 0x9c, 0xef, 0x7a, 0x9, 0x57, 0x24, 0xb1, 0xc2, 0x86, 0xf5, 0x60, 0x13, 0xe8, 0x9b, 0xe, 0x7d, 0x39, 0x4a, 0xdf, 0xac, 0x34, 0x47, 0xd2, 0xa1, 0xe5, 0x96, 0x3, 0x70, 0x8b, 0xf8, 0x6d, 0x1e, 0x5a, 0x29, 0xbc, 0xcf, 0x3f, 0x4c, 0xd9, 0xaa, 0xee, 0x9d, 0x8, 0x7b, 0x80, 0xf3, 0x66, 0x15, 0x51, 0x22, 0xb7, 0xc4, 0x5c, 0x2f, 0xba, 0xc9, 0x8d, 0xfe, 0x6b, 0x18, 0xe3, 0x90, 0x5, 0x76, 0x32, 0x41, 0xd4, 0xa7, 0xf9, 0x8a, 0x1f, 0x6c, 0x28, 0x5b, 0xce, 0xbd, 0x46, 0x35, 0xa0, 0xd3, 0x97, 0xe4, 0x71, 0x2, 0x9a, 0xe9, 0x7c, 0xf, 0x4b, 0x38, 0xad, 0xde, 0x25, 0x56, 0xc3, 0xb0, 0xf4, 0x87, 0x12, 0x61, 0xae, 0xdd, 0x48, 0x3b, 0x7f, 0xc, 0x99, 0xea, 0x11, 0x62, 0xf7, 0x84, 0xc0, 0xb3, 0x26, 0x55, 0xcd, 0xbe, 0x2b, 0x58, 0x1c, 0x6f, 0xfa, 0x89, 0x72, 0x1, 0x94, 0xe7, 0xa3, 0xd0, 0x45, 0x36, 0x68, 0x1b, 0x8e, 0xfd, 0xb9, 0xca, 0x5f, 0x2c, 0xd7, 0xa4, 0x31, 0x42, 0x6, 0x75, 0xe0, 0x93, 0xb, 0x78, 0xed, 0x9e, 0xda, 0xa9, 0x3c, 0x4f, 0xb4, 0xc7, 0x52, 0x21, 0x65, 0x16, 0x83, 0xf0}, + {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6, 0x13, 0x67, 0xfb, 0x8f, 0xde, 0xaa, 0x36, 0x42, 0x94, 0xe0, 0x7c, 0x8, 0x59, 0x2d, 0xb1, 0xc5, 0x26, 0x52, 0xce, 0xba, 0xeb, 0x9f, 0x3, 0x77, 0xa1, 0xd5, 0x49, 0x3d, 0x6c, 0x18, 0x84, 0xf0, 0x35, 0x41, 0xdd, 0xa9, 0xf8, 0x8c, 0x10, 0x64, 0xb2, 0xc6, 0x5a, 0x2e, 0x7f, 0xb, 0x97, 0xe3, 0x4c, 0x38, 0xa4, 0xd0, 0x81, 0xf5, 0x69, 0x1d, 0xcb, 0xbf, 0x23, 0x57, 0x6, 0x72, 0xee, 0x9a, 0x5f, 0x2b, 0xb7, 0xc3, 0x92, 0xe6, 0x7a, 0xe, 0xd8, 0xac, 0x30, 0x44, 0x15, 0x61, 0xfd, 0x89, 0x6a, 0x1e, 0x82, 0xf6, 0xa7, 0xd3, 0x4f, 0x3b, 0xed, 0x99, 0x5, 0x71, 0x20, 0x54, 0xc8, 0xbc, 0x79, 0xd, 0x91, 0xe5, 0xb4, 0xc0, 0x5c, 0x28, 0xfe, 0x8a, 0x16, 0x62, 0x33, 0x47, 0xdb, 0xaf, 0x98, 0xec, 0x70, 0x4, 0x55, 0x21, 0xbd, 0xc9, 0x1f, 0x6b, 0xf7, 0x83, 0xd2, 0xa6, 0x3a, 0x4e, 0x8b, 0xff, 0x63, 0x17, 0x46, 0x32, 0xae, 0xda, 0xc, 0x78, 0xe4, 0x90, 0xc1, 0xb5, 0x29, 0x5d, 0xbe, 0xca, 0x56, 0x22, 0x73, 0x7, 0x9b, 0xef, 0x39, 0x4d, 0xd1, 0xa5, 0xf4, 0x80, 0x1c, 0x68, 0xad, 0xd9, 0x45, 0x31, 0x60, 0x14, 0x88, 0xfc, 0x2a, 0x5e, 0xc2, 0xb6, 0xe7, 0x93, 0xf, 0x7b, 0xd4, 0xa0, 0x3c, 0x48, 0x19, 0x6d, 0xf1, 0x85, 0x53, 0x27, 0xbb, 0xcf, 0x9e, 0xea, 0x76, 0x2, 0xc7, 0xb3, 0x2f, 0x5b, 0xa, 0x7e, 0xe2, 0x96, 0x40, 0x34, 0xa8, 0xdc, 0x8d, 0xf9, 0x65, 0x11, 0xf2, 0x86, 0x1a, 0x6e, 0x3f, 0x4b, 0xd7, 0xa3, 
0x75, 0x1, 0x9d, 0xe9, 0xb8, 0xcc, 0x50, 0x24, 0xe1, 0x95, 0x9, 0x7d, 0x2c, 0x58, 0xc4, 0xb0, 0x66, 0x12, 0x8e, 0xfa, 0xab, 0xdf, 0x43, 0x37}, + {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9, 0x3, 0x76, 0xe9, 0x9c, 0xca, 0xbf, 0x20, 0x55, 0x8c, 0xf9, 0x66, 0x13, 0x45, 0x30, 0xaf, 0xda, 0x6, 0x73, 0xec, 0x99, 0xcf, 0xba, 0x25, 0x50, 0x89, 0xfc, 0x63, 0x16, 0x40, 0x35, 0xaa, 0xdf, 0x5, 0x70, 0xef, 0x9a, 0xcc, 0xb9, 0x26, 0x53, 0x8a, 0xff, 0x60, 0x15, 0x43, 0x36, 0xa9, 0xdc, 0xc, 0x79, 0xe6, 0x93, 0xc5, 0xb0, 0x2f, 0x5a, 0x83, 0xf6, 0x69, 0x1c, 0x4a, 0x3f, 0xa0, 0xd5, 0xf, 0x7a, 0xe5, 0x90, 0xc6, 0xb3, 0x2c, 0x59, 0x80, 0xf5, 0x6a, 0x1f, 0x49, 0x3c, 0xa3, 0xd6, 0xa, 0x7f, 0xe0, 0x95, 0xc3, 0xb6, 0x29, 0x5c, 0x85, 0xf0, 0x6f, 0x1a, 0x4c, 0x39, 0xa6, 0xd3, 0x9, 0x7c, 0xe3, 0x96, 0xc0, 0xb5, 0x2a, 0x5f, 0x86, 0xf3, 0x6c, 0x19, 0x4f, 0x3a, 0xa5, 0xd0, 0x18, 0x6d, 0xf2, 0x87, 0xd1, 0xa4, 0x3b, 0x4e, 0x97, 0xe2, 0x7d, 0x8, 0x5e, 0x2b, 0xb4, 0xc1, 0x1b, 0x6e, 0xf1, 0x84, 0xd2, 0xa7, 0x38, 0x4d, 0x94, 0xe1, 0x7e, 0xb, 0x5d, 0x28, 0xb7, 0xc2, 0x1e, 0x6b, 0xf4, 0x81, 0xd7, 0xa2, 0x3d, 0x48, 0x91, 0xe4, 0x7b, 0xe, 0x58, 0x2d, 0xb2, 0xc7, 0x1d, 0x68, 0xf7, 0x82, 0xd4, 0xa1, 0x3e, 0x4b, 0x92, 0xe7, 0x78, 0xd, 0x5b, 0x2e, 0xb1, 0xc4, 0x14, 0x61, 0xfe, 0x8b, 0xdd, 0xa8, 0x37, 0x42, 0x9b, 0xee, 0x71, 0x4, 0x52, 0x27, 0xb8, 0xcd, 0x17, 0x62, 0xfd, 0x88, 0xde, 0xab, 0x34, 0x41, 0x98, 0xed, 0x72, 0x7, 0x51, 0x24, 0xbb, 0xce, 0x12, 0x67, 0xf8, 0x8d, 0xdb, 0xae, 0x31, 0x44, 0x9d, 0xe8, 0x77, 0x2, 0x54, 0x21, 0xbe, 0xcb, 0x11, 0x64, 0xfb, 0x8e, 0xd8, 0xad, 0x32, 0x47, 0x9e, 0xeb, 0x74, 0x1, 0x57, 0x22, 0xbd, 0xc8}, + {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8, 0x33, 0x45, 0xdf, 0xa9, 0xf6, 0x80, 0x1a, 0x6c, 0xa4, 0xd2, 0x48, 0x3e, 0x61, 0x17, 0x8d, 0xfb, 0x66, 0x10, 0x8a, 0xfc, 0xa3, 0xd5, 0x4f, 0x39, 0xf1, 0x87, 0x1d, 0x6b, 0x34, 0x42, 0xd8, 0xae, 0x55, 0x23, 0xb9, 0xcf, 0x90, 0xe6, 0x7c, 0xa, 0xc2, 0xb4, 0x2e, 0x58, 0x7, 0x71, 0xeb, 0x9d, 0xcc, 0xba, 0x20, 0x56, 0x9, 0x7f, 0xe5, 0x93, 0x5b, 0x2d, 0xb7, 0xc1, 0x9e, 0xe8, 0x72, 0x4, 0xff, 0x89, 0x13, 0x65, 0x3a, 0x4c, 0xd6, 0xa0, 0x68, 0x1e, 0x84, 0xf2, 0xad, 0xdb, 0x41, 0x37, 0xaa, 0xdc, 0x46, 0x30, 0x6f, 0x19, 0x83, 0xf5, 0x3d, 0x4b, 0xd1, 0xa7, 0xf8, 0x8e, 0x14, 0x62, 0x99, 0xef, 0x75, 0x3, 0x5c, 0x2a, 0xb0, 0xc6, 0xe, 0x78, 0xe2, 0x94, 0xcb, 0xbd, 0x27, 0x51, 0x85, 0xf3, 0x69, 0x1f, 0x40, 0x36, 0xac, 0xda, 0x12, 0x64, 0xfe, 0x88, 0xd7, 0xa1, 0x3b, 0x4d, 0xb6, 0xc0, 0x5a, 0x2c, 0x73, 0x5, 0x9f, 0xe9, 0x21, 0x57, 0xcd, 0xbb, 0xe4, 0x92, 0x8, 0x7e, 0xe3, 0x95, 0xf, 0x79, 0x26, 0x50, 0xca, 0xbc, 0x74, 0x2, 0x98, 0xee, 0xb1, 0xc7, 0x5d, 0x2b, 0xd0, 0xa6, 0x3c, 0x4a, 0x15, 0x63, 0xf9, 0x8f, 0x47, 0x31, 0xab, 0xdd, 0x82, 0xf4, 0x6e, 0x18, 0x49, 0x3f, 0xa5, 0xd3, 0x8c, 0xfa, 0x60, 0x16, 0xde, 0xa8, 0x32, 0x44, 0x1b, 0x6d, 0xf7, 0x81, 0x7a, 0xc, 0x96, 0xe0, 0xbf, 0xc9, 0x53, 0x25, 0xed, 0x9b, 0x1, 0x77, 0x28, 0x5e, 0xc4, 0xb2, 0x2f, 0x59, 0xc3, 0xb5, 0xea, 0x9c, 0x6, 0x70, 0xb8, 0xce, 0x54, 0x22, 0x7d, 0xb, 0x91, 0xe7, 0x1c, 0x6a, 0xf0, 0x86, 0xd9, 0xaf, 0x35, 0x43, 0x8b, 0xfd, 0x67, 0x11, 0x4e, 0x38, 0xa2, 0xd4}, + {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7, 0x23, 0x54, 0xcd, 0xba, 0xe2, 0x95, 0xc, 0x7b, 0xbc, 0xcb, 0x52, 0x25, 0x7d, 0xa, 0x93, 0xe4, 0x46, 0x31, 0xa8, 0xdf, 0x87, 0xf0, 0x69, 0x1e, 0xd9, 0xae, 0x37, 0x40, 0x18, 0x6f, 0xf6, 0x81, 0x65, 0x12, 0x8b, 0xfc, 0xa4, 0xd3, 0x4a, 0x3d, 0xfa, 0x8d, 0x14, 0x63, 
0x3b, 0x4c, 0xd5, 0xa2, 0x8c, 0xfb, 0x62, 0x15, 0x4d, 0x3a, 0xa3, 0xd4, 0x13, 0x64, 0xfd, 0x8a, 0xd2, 0xa5, 0x3c, 0x4b, 0xaf, 0xd8, 0x41, 0x36, 0x6e, 0x19, 0x80, 0xf7, 0x30, 0x47, 0xde, 0xa9, 0xf1, 0x86, 0x1f, 0x68, 0xca, 0xbd, 0x24, 0x53, 0xb, 0x7c, 0xe5, 0x92, 0x55, 0x22, 0xbb, 0xcc, 0x94, 0xe3, 0x7a, 0xd, 0xe9, 0x9e, 0x7, 0x70, 0x28, 0x5f, 0xc6, 0xb1, 0x76, 0x1, 0x98, 0xef, 0xb7, 0xc0, 0x59, 0x2e, 0x5, 0x72, 0xeb, 0x9c, 0xc4, 0xb3, 0x2a, 0x5d, 0x9a, 0xed, 0x74, 0x3, 0x5b, 0x2c, 0xb5, 0xc2, 0x26, 0x51, 0xc8, 0xbf, 0xe7, 0x90, 0x9, 0x7e, 0xb9, 0xce, 0x57, 0x20, 0x78, 0xf, 0x96, 0xe1, 0x43, 0x34, 0xad, 0xda, 0x82, 0xf5, 0x6c, 0x1b, 0xdc, 0xab, 0x32, 0x45, 0x1d, 0x6a, 0xf3, 0x84, 0x60, 0x17, 0x8e, 0xf9, 0xa1, 0xd6, 0x4f, 0x38, 0xff, 0x88, 0x11, 0x66, 0x3e, 0x49, 0xd0, 0xa7, 0x89, 0xfe, 0x67, 0x10, 0x48, 0x3f, 0xa6, 0xd1, 0x16, 0x61, 0xf8, 0x8f, 0xd7, 0xa0, 0x39, 0x4e, 0xaa, 0xdd, 0x44, 0x33, 0x6b, 0x1c, 0x85, 0xf2, 0x35, 0x42, 0xdb, 0xac, 0xf4, 0x83, 0x1a, 0x6d, 0xcf, 0xb8, 0x21, 0x56, 0xe, 0x79, 0xe0, 0x97, 0x50, 0x27, 0xbe, 0xc9, 0x91, 0xe6, 0x7f, 0x8, 0xec, 0x9b, 0x2, 0x75, 0x2d, 0x5a, 0xc3, 0xb4, 0x73, 0x4, 0x9d, 0xea, 0xb2, 0xc5, 0x5c, 0x2b}, + {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92, 0xd3, 0xab, 0x23, 0x5b, 0x2e, 0x56, 0xde, 0xa6, 0x34, 0x4c, 0xc4, 0xbc, 0xc9, 0xb1, 0x39, 0x41, 0xbb, 0xc3, 0x4b, 0x33, 0x46, 0x3e, 0xb6, 0xce, 0x5c, 0x24, 0xac, 0xd4, 0xa1, 0xd9, 0x51, 0x29, 0x68, 0x10, 0x98, 0xe0, 0x95, 0xed, 0x65, 0x1d, 0x8f, 0xf7, 0x7f, 0x7, 0x72, 0xa, 0x82, 0xfa, 0x6b, 0x13, 0x9b, 0xe3, 0x96, 0xee, 0x66, 0x1e, 0x8c, 0xf4, 0x7c, 0x4, 0x71, 0x9, 0x81, 0xf9, 0xb8, 0xc0, 0x48, 0x30, 0x45, 0x3d, 0xb5, 0xcd, 0x5f, 0x27, 0xaf, 0xd7, 0xa2, 0xda, 0x52, 0x2a, 0xd0, 0xa8, 0x20, 0x58, 0x2d, 0x55, 0xdd, 0xa5, 0x37, 0x4f, 0xc7, 0xbf, 0xca, 0xb2, 0x3a, 0x42, 0x3, 0x7b, 0xf3, 0x8b, 0xfe, 0x86, 0xe, 0x76, 0xe4, 0x9c, 0x14, 0x6c, 0x19, 0x61, 0xe9, 0x91, 0xd6, 0xae, 0x26, 0x5e, 0x2b, 0x53, 0xdb, 0xa3, 0x31, 0x49, 0xc1, 0xb9, 0xcc, 0xb4, 0x3c, 0x44, 0x5, 0x7d, 0xf5, 0x8d, 0xf8, 0x80, 0x8, 0x70, 0xe2, 0x9a, 0x12, 0x6a, 0x1f, 0x67, 0xef, 0x97, 0x6d, 0x15, 0x9d, 0xe5, 0x90, 0xe8, 0x60, 0x18, 0x8a, 0xf2, 0x7a, 0x2, 0x77, 0xf, 0x87, 0xff, 0xbe, 0xc6, 0x4e, 0x36, 0x43, 0x3b, 0xb3, 0xcb, 0x59, 0x21, 0xa9, 0xd1, 0xa4, 0xdc, 0x54, 0x2c, 0xbd, 0xc5, 0x4d, 0x35, 0x40, 0x38, 0xb0, 0xc8, 0x5a, 0x22, 0xaa, 0xd2, 0xa7, 0xdf, 0x57, 0x2f, 0x6e, 0x16, 0x9e, 0xe6, 0x93, 0xeb, 0x63, 0x1b, 0x89, 0xf1, 0x79, 0x1, 0x74, 0xc, 0x84, 0xfc, 0x6, 0x7e, 0xf6, 0x8e, 0xfb, 0x83, 0xb, 0x73, 0xe1, 0x99, 0x11, 0x69, 0x1c, 0x64, 0xec, 0x94, 0xd5, 0xad, 0x25, 0x5d, 0x28, 0x50, 0xd8, 0xa0, 0x32, 0x4a, 0xc2, 0xba, 0xcf, 0xb7, 0x3f, 0x47}, + {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d, 0xc3, 0xba, 0x31, 0x48, 0x3a, 0x43, 0xc8, 0xb1, 0x2c, 0x55, 0xde, 0xa7, 0xd5, 0xac, 0x27, 0x5e, 0x9b, 0xe2, 0x69, 0x10, 0x62, 0x1b, 0x90, 0xe9, 0x74, 0xd, 0x86, 0xff, 0x8d, 0xf4, 0x7f, 0x6, 0x58, 0x21, 0xaa, 0xd3, 0xa1, 0xd8, 0x53, 0x2a, 0xb7, 0xce, 0x45, 0x3c, 0x4e, 0x37, 0xbc, 0xc5, 0x2b, 0x52, 0xd9, 0xa0, 0xd2, 0xab, 0x20, 0x59, 0xc4, 0xbd, 0x36, 0x4f, 0x3d, 0x44, 0xcf, 0xb6, 0xe8, 0x91, 0x1a, 0x63, 0x11, 0x68, 0xe3, 0x9a, 0x7, 0x7e, 0xf5, 0x8c, 0xfe, 0x87, 0xc, 0x75, 0xb0, 0xc9, 0x42, 0x3b, 0x49, 0x30, 0xbb, 0xc2, 0x5f, 0x26, 0xad, 0xd4, 0xa6, 0xdf, 0x54, 0x2d, 0x73, 0xa, 0x81, 0xf8, 0x8a, 0xf3, 0x78, 0x1, 0x9c, 0xe5, 0x6e, 0x17, 0x65, 0x1c, 0x97, 0xee, 0x56, 0x2f, 0xa4, 0xdd, 0xaf, 0xd6, 0x5d, 0x24, 0xb9, 0xc0, 0x4b, 0x32, 0x40, 0x39, 0xb2, 0xcb, 0x95, 
0xec, 0x67, 0x1e, 0x6c, 0x15, 0x9e, 0xe7, 0x7a, 0x3, 0x88, 0xf1, 0x83, 0xfa, 0x71, 0x8, 0xcd, 0xb4, 0x3f, 0x46, 0x34, 0x4d, 0xc6, 0xbf, 0x22, 0x5b, 0xd0, 0xa9, 0xdb, 0xa2, 0x29, 0x50, 0xe, 0x77, 0xfc, 0x85, 0xf7, 0x8e, 0x5, 0x7c, 0xe1, 0x98, 0x13, 0x6a, 0x18, 0x61, 0xea, 0x93, 0x7d, 0x4, 0x8f, 0xf6, 0x84, 0xfd, 0x76, 0xf, 0x92, 0xeb, 0x60, 0x19, 0x6b, 0x12, 0x99, 0xe0, 0xbe, 0xc7, 0x4c, 0x35, 0x47, 0x3e, 0xb5, 0xcc, 0x51, 0x28, 0xa3, 0xda, 0xa8, 0xd1, 0x5a, 0x23, 0xe6, 0x9f, 0x14, 0x6d, 0x1f, 0x66, 0xed, 0x94, 0x9, 0x70, 0xfb, 0x82, 0xf0, 0x89, 0x2, 0x7b, 0x25, 0x5c, 0xd7, 0xae, 0xdc, 0xa5, 0x2e, 0x57, 0xca, 0xb3, 0x38, 0x41, 0x33, 0x4a, 0xc1, 0xb8}, + {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c, 0xf3, 0x89, 0x7, 0x7d, 0x6, 0x7c, 0xf2, 0x88, 0x4, 0x7e, 0xf0, 0x8a, 0xf1, 0x8b, 0x5, 0x7f, 0xfb, 0x81, 0xf, 0x75, 0xe, 0x74, 0xfa, 0x80, 0xc, 0x76, 0xf8, 0x82, 0xf9, 0x83, 0xd, 0x77, 0x8, 0x72, 0xfc, 0x86, 0xfd, 0x87, 0x9, 0x73, 0xff, 0x85, 0xb, 0x71, 0xa, 0x70, 0xfe, 0x84, 0xeb, 0x91, 0x1f, 0x65, 0x1e, 0x64, 0xea, 0x90, 0x1c, 0x66, 0xe8, 0x92, 0xe9, 0x93, 0x1d, 0x67, 0x18, 0x62, 0xec, 0x96, 0xed, 0x97, 0x19, 0x63, 0xef, 0x95, 0x1b, 0x61, 0x1a, 0x60, 0xee, 0x94, 0x10, 0x6a, 0xe4, 0x9e, 0xe5, 0x9f, 0x11, 0x6b, 0xe7, 0x9d, 0x13, 0x69, 0x12, 0x68, 0xe6, 0x9c, 0xe3, 0x99, 0x17, 0x6d, 0x16, 0x6c, 0xe2, 0x98, 0x14, 0x6e, 0xe0, 0x9a, 0xe1, 0x9b, 0x15, 0x6f, 0xcb, 0xb1, 0x3f, 0x45, 0x3e, 0x44, 0xca, 0xb0, 0x3c, 0x46, 0xc8, 0xb2, 0xc9, 0xb3, 0x3d, 0x47, 0x38, 0x42, 0xcc, 0xb6, 0xcd, 0xb7, 0x39, 0x43, 0xcf, 0xb5, 0x3b, 0x41, 0x3a, 0x40, 0xce, 0xb4, 0x30, 0x4a, 0xc4, 0xbe, 0xc5, 0xbf, 0x31, 0x4b, 0xc7, 0xbd, 0x33, 0x49, 0x32, 0x48, 0xc6, 0xbc, 0xc3, 0xb9, 0x37, 0x4d, 0x36, 0x4c, 0xc2, 0xb8, 0x34, 0x4e, 0xc0, 0xba, 0xc1, 0xbb, 0x35, 0x4f, 0x20, 0x5a, 0xd4, 0xae, 0xd5, 0xaf, 0x21, 0x5b, 0xd7, 0xad, 0x23, 0x59, 0x22, 0x58, 0xd6, 0xac, 0xd3, 0xa9, 0x27, 0x5d, 0x26, 0x5c, 0xd2, 0xa8, 0x24, 0x5e, 0xd0, 0xaa, 0xd1, 0xab, 0x25, 0x5f, 0xdb, 0xa1, 0x2f, 0x55, 0x2e, 0x54, 0xda, 0xa0, 0x2c, 0x56, 0xd8, 0xa2, 0xd9, 0xa3, 0x2d, 0x57, 0x28, 0x52, 0xdc, 0xa6, 0xdd, 0xa7, 0x29, 0x53, 0xdf, 0xa5, 0x2b, 0x51, 0x2a, 0x50, 0xde, 0xa4}, + {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83, 0xe3, 0x98, 0x15, 0x6e, 0x12, 0x69, 0xe4, 0x9f, 0x1c, 0x67, 0xea, 0x91, 0xed, 0x96, 0x1b, 0x60, 0xdb, 0xa0, 0x2d, 0x56, 0x2a, 0x51, 0xdc, 0xa7, 0x24, 0x5f, 0xd2, 0xa9, 0xd5, 0xae, 0x23, 0x58, 0x38, 0x43, 0xce, 0xb5, 0xc9, 0xb2, 0x3f, 0x44, 0xc7, 0xbc, 0x31, 0x4a, 0x36, 0x4d, 0xc0, 0xbb, 0xab, 0xd0, 0x5d, 0x26, 0x5a, 0x21, 0xac, 0xd7, 0x54, 0x2f, 0xa2, 0xd9, 0xa5, 0xde, 0x53, 0x28, 0x48, 0x33, 0xbe, 0xc5, 0xb9, 0xc2, 0x4f, 0x34, 0xb7, 0xcc, 0x41, 0x3a, 0x46, 0x3d, 0xb0, 0xcb, 0x70, 0xb, 0x86, 0xfd, 0x81, 0xfa, 0x77, 0xc, 0x8f, 0xf4, 0x79, 0x2, 0x7e, 0x5, 0x88, 0xf3, 0x93, 0xe8, 0x65, 0x1e, 0x62, 0x19, 0x94, 0xef, 0x6c, 0x17, 0x9a, 0xe1, 0x9d, 0xe6, 0x6b, 0x10, 0x4b, 0x30, 0xbd, 0xc6, 0xba, 0xc1, 0x4c, 0x37, 0xb4, 0xcf, 0x42, 0x39, 0x45, 0x3e, 0xb3, 0xc8, 0xa8, 0xd3, 0x5e, 0x25, 0x59, 0x22, 0xaf, 0xd4, 0x57, 0x2c, 0xa1, 0xda, 0xa6, 0xdd, 0x50, 0x2b, 0x90, 0xeb, 0x66, 0x1d, 0x61, 0x1a, 0x97, 0xec, 0x6f, 0x14, 0x99, 0xe2, 0x9e, 0xe5, 0x68, 0x13, 0x73, 0x8, 0x85, 0xfe, 0x82, 0xf9, 0x74, 0xf, 0x8c, 0xf7, 0x7a, 0x1, 0x7d, 0x6, 0x8b, 0xf0, 0xe0, 0x9b, 0x16, 0x6d, 0x11, 0x6a, 0xe7, 0x9c, 0x1f, 0x64, 0xe9, 0x92, 0xee, 0x95, 0x18, 0x63, 0x3, 0x78, 0xf5, 0x8e, 0xf2, 0x89, 0x4, 0x7f, 0xfc, 0x87, 0xa, 0x71, 0xd, 0x76, 0xfb, 0x80, 0x3b, 0x40, 0xcd, 0xb6, 0xca, 0xb1, 
0x3c, 0x47, 0xc4, 0xbf, 0x32, 0x49, 0x35, 0x4e, 0xc3, 0xb8, 0xd8, 0xa3, 0x2e, 0x55, 0x29, 0x52, 0xdf, 0xa4, 0x27, 0x5c, 0xd1, 0xaa, 0xd6, 0xad, 0x20, 0x5b}, + {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae, 0x93, 0xef, 0x6b, 0x17, 0x7e, 0x2, 0x86, 0xfa, 0x54, 0x28, 0xac, 0xd0, 0xb9, 0xc5, 0x41, 0x3d, 0x3b, 0x47, 0xc3, 0xbf, 0xd6, 0xaa, 0x2e, 0x52, 0xfc, 0x80, 0x4, 0x78, 0x11, 0x6d, 0xe9, 0x95, 0xa8, 0xd4, 0x50, 0x2c, 0x45, 0x39, 0xbd, 0xc1, 0x6f, 0x13, 0x97, 0xeb, 0x82, 0xfe, 0x7a, 0x6, 0x76, 0xa, 0x8e, 0xf2, 0x9b, 0xe7, 0x63, 0x1f, 0xb1, 0xcd, 0x49, 0x35, 0x5c, 0x20, 0xa4, 0xd8, 0xe5, 0x99, 0x1d, 0x61, 0x8, 0x74, 0xf0, 0x8c, 0x22, 0x5e, 0xda, 0xa6, 0xcf, 0xb3, 0x37, 0x4b, 0x4d, 0x31, 0xb5, 0xc9, 0xa0, 0xdc, 0x58, 0x24, 0x8a, 0xf6, 0x72, 0xe, 0x67, 0x1b, 0x9f, 0xe3, 0xde, 0xa2, 0x26, 0x5a, 0x33, 0x4f, 0xcb, 0xb7, 0x19, 0x65, 0xe1, 0x9d, 0xf4, 0x88, 0xc, 0x70, 0xec, 0x90, 0x14, 0x68, 0x1, 0x7d, 0xf9, 0x85, 0x2b, 0x57, 0xd3, 0xaf, 0xc6, 0xba, 0x3e, 0x42, 0x7f, 0x3, 0x87, 0xfb, 0x92, 0xee, 0x6a, 0x16, 0xb8, 0xc4, 0x40, 0x3c, 0x55, 0x29, 0xad, 0xd1, 0xd7, 0xab, 0x2f, 0x53, 0x3a, 0x46, 0xc2, 0xbe, 0x10, 0x6c, 0xe8, 0x94, 0xfd, 0x81, 0x5, 0x79, 0x44, 0x38, 0xbc, 0xc0, 0xa9, 0xd5, 0x51, 0x2d, 0x83, 0xff, 0x7b, 0x7, 0x6e, 0x12, 0x96, 0xea, 0x9a, 0xe6, 0x62, 0x1e, 0x77, 0xb, 0x8f, 0xf3, 0x5d, 0x21, 0xa5, 0xd9, 0xb0, 0xcc, 0x48, 0x34, 0x9, 0x75, 0xf1, 0x8d, 0xe4, 0x98, 0x1c, 0x60, 0xce, 0xb2, 0x36, 0x4a, 0x23, 0x5f, 0xdb, 0xa7, 0xa1, 0xdd, 0x59, 0x25, 0x4c, 0x30, 0xb4, 0xc8, 0x66, 0x1a, 0x9e, 0xe2, 0x8b, 0xf7, 0x73, 0xf, 0x32, 0x4e, 0xca, 0xb6, 0xdf, 0xa3, 0x27, 0x5b, 0xf5, 0x89, 0xd, 0x71, 0x18, 0x64, 0xe0, 0x9c}, + {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1, 0x83, 0xfe, 0x79, 0x4, 0x6a, 0x17, 0x90, 0xed, 0x4c, 0x31, 0xb6, 0xcb, 0xa5, 0xd8, 0x5f, 0x22, 0x1b, 0x66, 0xe1, 0x9c, 0xf2, 0x8f, 0x8, 0x75, 0xd4, 0xa9, 0x2e, 0x53, 0x3d, 0x40, 0xc7, 0xba, 0x98, 0xe5, 0x62, 0x1f, 0x71, 0xc, 0x8b, 0xf6, 0x57, 0x2a, 0xad, 0xd0, 0xbe, 0xc3, 0x44, 0x39, 0x36, 0x4b, 0xcc, 0xb1, 0xdf, 0xa2, 0x25, 0x58, 0xf9, 0x84, 0x3, 0x7e, 0x10, 0x6d, 0xea, 0x97, 0xb5, 0xc8, 0x4f, 0x32, 0x5c, 0x21, 0xa6, 0xdb, 0x7a, 0x7, 0x80, 0xfd, 0x93, 0xee, 0x69, 0x14, 0x2d, 0x50, 0xd7, 0xaa, 0xc4, 0xb9, 0x3e, 0x43, 0xe2, 0x9f, 0x18, 0x65, 0xb, 0x76, 0xf1, 0x8c, 0xae, 0xd3, 0x54, 0x29, 0x47, 0x3a, 0xbd, 0xc0, 0x61, 0x1c, 0x9b, 0xe6, 0x88, 0xf5, 0x72, 0xf, 0x6c, 0x11, 0x96, 0xeb, 0x85, 0xf8, 0x7f, 0x2, 0xa3, 0xde, 0x59, 0x24, 0x4a, 0x37, 0xb0, 0xcd, 0xef, 0x92, 0x15, 0x68, 0x6, 0x7b, 0xfc, 0x81, 0x20, 0x5d, 0xda, 0xa7, 0xc9, 0xb4, 0x33, 0x4e, 0x77, 0xa, 0x8d, 0xf0, 0x9e, 0xe3, 0x64, 0x19, 0xb8, 0xc5, 0x42, 0x3f, 0x51, 0x2c, 0xab, 0xd6, 0xf4, 0x89, 0xe, 0x73, 0x1d, 0x60, 0xe7, 0x9a, 0x3b, 0x46, 0xc1, 0xbc, 0xd2, 0xaf, 0x28, 0x55, 0x5a, 0x27, 0xa0, 0xdd, 0xb3, 0xce, 0x49, 0x34, 0x95, 0xe8, 0x6f, 0x12, 0x7c, 0x1, 0x86, 0xfb, 0xd9, 0xa4, 0x23, 0x5e, 0x30, 0x4d, 0xca, 0xb7, 0x16, 0x6b, 0xec, 0x91, 0xff, 0x82, 0x5, 0x78, 0x41, 0x3c, 0xbb, 0xc6, 0xa8, 0xd5, 0x52, 0x2f, 0x8e, 0xf3, 0x74, 0x9, 0x67, 0x1a, 0x9d, 0xe0, 0xc2, 0xbf, 0x38, 0x45, 0x2b, 0x56, 0xd1, 0xac, 0xd, 0x70, 0xf7, 0x8a, 0xe4, 0x99, 0x1e, 0x63}, + {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0, 0xb3, 0xcd, 0x4f, 0x31, 0x56, 0x28, 0xaa, 0xd4, 0x64, 0x1a, 0x98, 0xe6, 0x81, 0xff, 0x7d, 0x3, 0x7b, 0x5, 0x87, 0xf9, 0x9e, 0xe0, 0x62, 0x1c, 0xac, 0xd2, 0x50, 0x2e, 0x49, 0x37, 0xb5, 0xcb, 0xc8, 0xb6, 0x34, 0x4a, 0x2d, 0x53, 0xd1, 0xaf, 0x1f, 0x61, 
0xe3, 0x9d, 0xfa, 0x84, 0x6, 0x78, 0xf6, 0x88, 0xa, 0x74, 0x13, 0x6d, 0xef, 0x91, 0x21, 0x5f, 0xdd, 0xa3, 0xc4, 0xba, 0x38, 0x46, 0x45, 0x3b, 0xb9, 0xc7, 0xa0, 0xde, 0x5c, 0x22, 0x92, 0xec, 0x6e, 0x10, 0x77, 0x9, 0x8b, 0xf5, 0x8d, 0xf3, 0x71, 0xf, 0x68, 0x16, 0x94, 0xea, 0x5a, 0x24, 0xa6, 0xd8, 0xbf, 0xc1, 0x43, 0x3d, 0x3e, 0x40, 0xc2, 0xbc, 0xdb, 0xa5, 0x27, 0x59, 0xe9, 0x97, 0x15, 0x6b, 0xc, 0x72, 0xf0, 0x8e, 0xf1, 0x8f, 0xd, 0x73, 0x14, 0x6a, 0xe8, 0x96, 0x26, 0x58, 0xda, 0xa4, 0xc3, 0xbd, 0x3f, 0x41, 0x42, 0x3c, 0xbe, 0xc0, 0xa7, 0xd9, 0x5b, 0x25, 0x95, 0xeb, 0x69, 0x17, 0x70, 0xe, 0x8c, 0xf2, 0x8a, 0xf4, 0x76, 0x8, 0x6f, 0x11, 0x93, 0xed, 0x5d, 0x23, 0xa1, 0xdf, 0xb8, 0xc6, 0x44, 0x3a, 0x39, 0x47, 0xc5, 0xbb, 0xdc, 0xa2, 0x20, 0x5e, 0xee, 0x90, 0x12, 0x6c, 0xb, 0x75, 0xf7, 0x89, 0x7, 0x79, 0xfb, 0x85, 0xe2, 0x9c, 0x1e, 0x60, 0xd0, 0xae, 0x2c, 0x52, 0x35, 0x4b, 0xc9, 0xb7, 0xb4, 0xca, 0x48, 0x36, 0x51, 0x2f, 0xad, 0xd3, 0x63, 0x1d, 0x9f, 0xe1, 0x86, 0xf8, 0x7a, 0x4, 0x7c, 0x2, 0x80, 0xfe, 0x99, 0xe7, 0x65, 0x1b, 0xab, 0xd5, 0x57, 0x29, 0x4e, 0x30, 0xb2, 0xcc, 0xcf, 0xb1, 0x33, 0x4d, 0x2a, 0x54, 0xd6, 0xa8, 0x18, 0x66, 0xe4, 0x9a, 0xfd, 0x83, 0x1, 0x7f}, + {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf, 0xa3, 0xdc, 0x5d, 0x22, 0x42, 0x3d, 0xbc, 0xc3, 0x7c, 0x3, 0x82, 0xfd, 0x9d, 0xe2, 0x63, 0x1c, 0x5b, 0x24, 0xa5, 0xda, 0xba, 0xc5, 0x44, 0x3b, 0x84, 0xfb, 0x7a, 0x5, 0x65, 0x1a, 0x9b, 0xe4, 0xf8, 0x87, 0x6, 0x79, 0x19, 0x66, 0xe7, 0x98, 0x27, 0x58, 0xd9, 0xa6, 0xc6, 0xb9, 0x38, 0x47, 0xb6, 0xc9, 0x48, 0x37, 0x57, 0x28, 0xa9, 0xd6, 0x69, 0x16, 0x97, 0xe8, 0x88, 0xf7, 0x76, 0x9, 0x15, 0x6a, 0xeb, 0x94, 0xf4, 0x8b, 0xa, 0x75, 0xca, 0xb5, 0x34, 0x4b, 0x2b, 0x54, 0xd5, 0xaa, 0xed, 0x92, 0x13, 0x6c, 0xc, 0x73, 0xf2, 0x8d, 0x32, 0x4d, 0xcc, 0xb3, 0xd3, 0xac, 0x2d, 0x52, 0x4e, 0x31, 0xb0, 0xcf, 0xaf, 0xd0, 0x51, 0x2e, 0x91, 0xee, 0x6f, 0x10, 0x70, 0xf, 0x8e, 0xf1, 0x71, 0xe, 0x8f, 0xf0, 0x90, 0xef, 0x6e, 0x11, 0xae, 0xd1, 0x50, 0x2f, 0x4f, 0x30, 0xb1, 0xce, 0xd2, 0xad, 0x2c, 0x53, 0x33, 0x4c, 0xcd, 0xb2, 0xd, 0x72, 0xf3, 0x8c, 0xec, 0x93, 0x12, 0x6d, 0x2a, 0x55, 0xd4, 0xab, 0xcb, 0xb4, 0x35, 0x4a, 0xf5, 0x8a, 0xb, 0x74, 0x14, 0x6b, 0xea, 0x95, 0x89, 0xf6, 0x77, 0x8, 0x68, 0x17, 0x96, 0xe9, 0x56, 0x29, 0xa8, 0xd7, 0xb7, 0xc8, 0x49, 0x36, 0xc7, 0xb8, 0x39, 0x46, 0x26, 0x59, 0xd8, 0xa7, 0x18, 0x67, 0xe6, 0x99, 0xf9, 0x86, 0x7, 0x78, 0x64, 0x1b, 0x9a, 0xe5, 0x85, 0xfa, 0x7b, 0x4, 0xbb, 0xc4, 0x45, 0x3a, 0x5a, 0x25, 0xa4, 0xdb, 0x9c, 0xe3, 0x62, 0x1d, 0x7d, 0x2, 0x83, 0xfc, 0x43, 0x3c, 0xbd, 0xc2, 0xa2, 0xdd, 0x5c, 0x23, 0x3f, 0x40, 0xc1, 0xbe, 0xde, 0xa1, 0x20, 0x5f, 0xe0, 0x9f, 0x1e, 0x61, 0x1, 0x7e, 0xff, 0x80}, + {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3, 0xe8, 0x68, 0xf5, 0x75, 0xd2, 0x52, 0xcf, 0x4f, 0x9c, 0x1c, 0x81, 0x1, 0xa6, 0x26, 0xbb, 0x3b, 0xcd, 0x4d, 0xd0, 0x50, 0xf7, 0x77, 0xea, 0x6a, 0xb9, 0x39, 0xa4, 0x24, 0x83, 0x3, 0x9e, 0x1e, 0x25, 0xa5, 0x38, 0xb8, 0x1f, 0x9f, 0x2, 0x82, 0x51, 0xd1, 0x4c, 0xcc, 0x6b, 0xeb, 0x76, 0xf6, 0x87, 0x7, 0x9a, 0x1a, 0xbd, 0x3d, 0xa0, 0x20, 0xf3, 0x73, 0xee, 0x6e, 0xc9, 0x49, 0xd4, 0x54, 0x6f, 0xef, 0x72, 0xf2, 0x55, 0xd5, 0x48, 0xc8, 0x1b, 0x9b, 0x6, 0x86, 0x21, 0xa1, 0x3c, 0xbc, 0x4a, 0xca, 0x57, 0xd7, 0x70, 0xf0, 0x6d, 0xed, 0x3e, 0xbe, 0x23, 0xa3, 0x4, 0x84, 0x19, 0x99, 0xa2, 0x22, 0xbf, 0x3f, 0x98, 0x18, 0x85, 0x5, 0xd6, 0x56, 0xcb, 0x4b, 0xec, 0x6c, 0xf1, 0x71, 0x13, 0x93, 0xe, 0x8e, 0x29, 0xa9, 0x34, 0xb4, 0x67, 0xe7, 0x7a, 0xfa, 0x5d, 0xdd, 0x40, 
0xc0, 0xfb, 0x7b, 0xe6, 0x66, 0xc1, 0x41, 0xdc, 0x5c, 0x8f, 0xf, 0x92, 0x12, 0xb5, 0x35, 0xa8, 0x28, 0xde, 0x5e, 0xc3, 0x43, 0xe4, 0x64, 0xf9, 0x79, 0xaa, 0x2a, 0xb7, 0x37, 0x90, 0x10, 0x8d, 0xd, 0x36, 0xb6, 0x2b, 0xab, 0xc, 0x8c, 0x11, 0x91, 0x42, 0xc2, 0x5f, 0xdf, 0x78, 0xf8, 0x65, 0xe5, 0x94, 0x14, 0x89, 0x9, 0xae, 0x2e, 0xb3, 0x33, 0xe0, 0x60, 0xfd, 0x7d, 0xda, 0x5a, 0xc7, 0x47, 0x7c, 0xfc, 0x61, 0xe1, 0x46, 0xc6, 0x5b, 0xdb, 0x8, 0x88, 0x15, 0x95, 0x32, 0xb2, 0x2f, 0xaf, 0x59, 0xd9, 0x44, 0xc4, 0x63, 0xe3, 0x7e, 0xfe, 0x2d, 0xad, 0x30, 0xb0, 0x17, 0x97, 0xa, 0x8a, 0xb1, 0x31, 0xac, 0x2c, 0x8b, 0xb, 0x96, 0x16, 0xc5, 0x45, 0xd8, 0x58, 0xff, 0x7f, 0xe2, 0x62}, + {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc, 0xf8, 0x79, 0xe7, 0x66, 0xc6, 0x47, 0xd9, 0x58, 0x84, 0x5, 0x9b, 0x1a, 0xba, 0x3b, 0xa5, 0x24, 0xed, 0x6c, 0xf2, 0x73, 0xd3, 0x52, 0xcc, 0x4d, 0x91, 0x10, 0x8e, 0xf, 0xaf, 0x2e, 0xb0, 0x31, 0x15, 0x94, 0xa, 0x8b, 0x2b, 0xaa, 0x34, 0xb5, 0x69, 0xe8, 0x76, 0xf7, 0x57, 0xd6, 0x48, 0xc9, 0xc7, 0x46, 0xd8, 0x59, 0xf9, 0x78, 0xe6, 0x67, 0xbb, 0x3a, 0xa4, 0x25, 0x85, 0x4, 0x9a, 0x1b, 0x3f, 0xbe, 0x20, 0xa1, 0x1, 0x80, 0x1e, 0x9f, 0x43, 0xc2, 0x5c, 0xdd, 0x7d, 0xfc, 0x62, 0xe3, 0x2a, 0xab, 0x35, 0xb4, 0x14, 0x95, 0xb, 0x8a, 0x56, 0xd7, 0x49, 0xc8, 0x68, 0xe9, 0x77, 0xf6, 0xd2, 0x53, 0xcd, 0x4c, 0xec, 0x6d, 0xf3, 0x72, 0xae, 0x2f, 0xb1, 0x30, 0x90, 0x11, 0x8f, 0xe, 0x93, 0x12, 0x8c, 0xd, 0xad, 0x2c, 0xb2, 0x33, 0xef, 0x6e, 0xf0, 0x71, 0xd1, 0x50, 0xce, 0x4f, 0x6b, 0xea, 0x74, 0xf5, 0x55, 0xd4, 0x4a, 0xcb, 0x17, 0x96, 0x8, 0x89, 0x29, 0xa8, 0x36, 0xb7, 0x7e, 0xff, 0x61, 0xe0, 0x40, 0xc1, 0x5f, 0xde, 0x2, 0x83, 0x1d, 0x9c, 0x3c, 0xbd, 0x23, 0xa2, 0x86, 0x7, 0x99, 0x18, 0xb8, 0x39, 0xa7, 0x26, 0xfa, 0x7b, 0xe5, 0x64, 0xc4, 0x45, 0xdb, 0x5a, 0x54, 0xd5, 0x4b, 0xca, 0x6a, 0xeb, 0x75, 0xf4, 0x28, 0xa9, 0x37, 0xb6, 0x16, 0x97, 0x9, 0x88, 0xac, 0x2d, 0xb3, 0x32, 0x92, 0x13, 0x8d, 0xc, 0xd0, 0x51, 0xcf, 0x4e, 0xee, 0x6f, 0xf1, 0x70, 0xb9, 0x38, 0xa6, 0x27, 0x87, 0x6, 0x98, 0x19, 0xc5, 0x44, 0xda, 0x5b, 0xfb, 0x7a, 0xe4, 0x65, 0x41, 0xc0, 0x5e, 0xdf, 0x7f, 0xfe, 0x60, 0xe1, 0x3d, 0xbc, 0x22, 0xa3, 0x3, 0x82, 0x1c, 0x9d}, + {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd, 0xc8, 0x4a, 0xd1, 0x53, 0xfa, 0x78, 0xe3, 0x61, 0xac, 0x2e, 0xb5, 0x37, 0x9e, 0x1c, 0x87, 0x5, 0x8d, 0xf, 0x94, 0x16, 0xbf, 0x3d, 0xa6, 0x24, 0xe9, 0x6b, 0xf0, 0x72, 0xdb, 0x59, 0xc2, 0x40, 0x45, 0xc7, 0x5c, 0xde, 0x77, 0xf5, 0x6e, 0xec, 0x21, 0xa3, 0x38, 0xba, 0x13, 0x91, 0xa, 0x88, 0x7, 0x85, 0x1e, 0x9c, 0x35, 0xb7, 0x2c, 0xae, 0x63, 0xe1, 0x7a, 0xf8, 0x51, 0xd3, 0x48, 0xca, 0xcf, 0x4d, 0xd6, 0x54, 0xfd, 0x7f, 0xe4, 0x66, 0xab, 0x29, 0xb2, 0x30, 0x99, 0x1b, 0x80, 0x2, 0x8a, 0x8, 0x93, 0x11, 0xb8, 0x3a, 0xa1, 0x23, 0xee, 0x6c, 0xf7, 0x75, 0xdc, 0x5e, 0xc5, 0x47, 0x42, 0xc0, 0x5b, 0xd9, 0x70, 0xf2, 0x69, 0xeb, 0x26, 0xa4, 0x3f, 0xbd, 0x14, 0x96, 0xd, 0x8f, 0xe, 0x8c, 0x17, 0x95, 0x3c, 0xbe, 0x25, 0xa7, 0x6a, 0xe8, 0x73, 0xf1, 0x58, 0xda, 0x41, 0xc3, 0xc6, 0x44, 0xdf, 0x5d, 0xf4, 0x76, 0xed, 0x6f, 0xa2, 0x20, 0xbb, 0x39, 0x90, 0x12, 0x89, 0xb, 0x83, 0x1, 0x9a, 0x18, 0xb1, 0x33, 0xa8, 0x2a, 0xe7, 0x65, 0xfe, 0x7c, 0xd5, 0x57, 0xcc, 0x4e, 0x4b, 0xc9, 0x52, 0xd0, 0x79, 0xfb, 0x60, 0xe2, 0x2f, 0xad, 0x36, 0xb4, 0x1d, 0x9f, 0x4, 0x86, 0x9, 0x8b, 0x10, 0x92, 0x3b, 0xb9, 0x22, 0xa0, 0x6d, 0xef, 0x74, 0xf6, 0x5f, 0xdd, 0x46, 0xc4, 0xc1, 0x43, 0xd8, 0x5a, 0xf3, 0x71, 0xea, 0x68, 0xa5, 0x27, 0xbc, 0x3e, 0x97, 0x15, 0x8e, 0xc, 0x84, 0x6, 0x9d, 0x1f, 
0xb6, 0x34, 0xaf, 0x2d, 0xe0, 0x62, 0xf9, 0x7b, 0xd2, 0x50, 0xcb, 0x49, 0x4c, 0xce, 0x55, 0xd7, 0x7e, 0xfc, 0x67, 0xe5, 0x28, 0xaa, 0x31, 0xb3, 0x1a, 0x98, 0x3, 0x81}, + {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2, 0xd8, 0x5b, 0xc3, 0x40, 0xee, 0x6d, 0xf5, 0x76, 0xb4, 0x37, 0xaf, 0x2c, 0x82, 0x1, 0x99, 0x1a, 0xad, 0x2e, 0xb6, 0x35, 0x9b, 0x18, 0x80, 0x3, 0xc1, 0x42, 0xda, 0x59, 0xf7, 0x74, 0xec, 0x6f, 0x75, 0xf6, 0x6e, 0xed, 0x43, 0xc0, 0x58, 0xdb, 0x19, 0x9a, 0x2, 0x81, 0x2f, 0xac, 0x34, 0xb7, 0x47, 0xc4, 0x5c, 0xdf, 0x71, 0xf2, 0x6a, 0xe9, 0x2b, 0xa8, 0x30, 0xb3, 0x1d, 0x9e, 0x6, 0x85, 0x9f, 0x1c, 0x84, 0x7, 0xa9, 0x2a, 0xb2, 0x31, 0xf3, 0x70, 0xe8, 0x6b, 0xc5, 0x46, 0xde, 0x5d, 0xea, 0x69, 0xf1, 0x72, 0xdc, 0x5f, 0xc7, 0x44, 0x86, 0x5, 0x9d, 0x1e, 0xb0, 0x33, 0xab, 0x28, 0x32, 0xb1, 0x29, 0xaa, 0x4, 0x87, 0x1f, 0x9c, 0x5e, 0xdd, 0x45, 0xc6, 0x68, 0xeb, 0x73, 0xf0, 0x8e, 0xd, 0x95, 0x16, 0xb8, 0x3b, 0xa3, 0x20, 0xe2, 0x61, 0xf9, 0x7a, 0xd4, 0x57, 0xcf, 0x4c, 0x56, 0xd5, 0x4d, 0xce, 0x60, 0xe3, 0x7b, 0xf8, 0x3a, 0xb9, 0x21, 0xa2, 0xc, 0x8f, 0x17, 0x94, 0x23, 0xa0, 0x38, 0xbb, 0x15, 0x96, 0xe, 0x8d, 0x4f, 0xcc, 0x54, 0xd7, 0x79, 0xfa, 0x62, 0xe1, 0xfb, 0x78, 0xe0, 0x63, 0xcd, 0x4e, 0xd6, 0x55, 0x97, 0x14, 0x8c, 0xf, 0xa1, 0x22, 0xba, 0x39, 0xc9, 0x4a, 0xd2, 0x51, 0xff, 0x7c, 0xe4, 0x67, 0xa5, 0x26, 0xbe, 0x3d, 0x93, 0x10, 0x88, 0xb, 0x11, 0x92, 0xa, 0x89, 0x27, 0xa4, 0x3c, 0xbf, 0x7d, 0xfe, 0x66, 0xe5, 0x4b, 0xc8, 0x50, 0xd3, 0x64, 0xe7, 0x7f, 0xfc, 0x52, 0xd1, 0x49, 0xca, 0x8, 0x8b, 0x13, 0x90, 0x3e, 0xbd, 0x25, 0xa6, 0xbc, 0x3f, 0xa7, 0x24, 0x8a, 0x9, 0x91, 0x12, 0xd0, 0x53, 0xcb, 0x48, 0xe6, 0x65, 0xfd, 0x7e}, + {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef, 0xa8, 0x2c, 0xbd, 0x39, 0x82, 0x6, 0x97, 0x13, 0xfc, 0x78, 0xe9, 0x6d, 0xd6, 0x52, 0xc3, 0x47, 0x4d, 0xc9, 0x58, 0xdc, 0x67, 0xe3, 0x72, 0xf6, 0x19, 0x9d, 0xc, 0x88, 0x33, 0xb7, 0x26, 0xa2, 0xe5, 0x61, 0xf0, 0x74, 0xcf, 0x4b, 0xda, 0x5e, 0xb1, 0x35, 0xa4, 0x20, 0x9b, 0x1f, 0x8e, 0xa, 0x9a, 0x1e, 0x8f, 0xb, 0xb0, 0x34, 0xa5, 0x21, 0xce, 0x4a, 0xdb, 0x5f, 0xe4, 0x60, 0xf1, 0x75, 0x32, 0xb6, 0x27, 0xa3, 0x18, 0x9c, 0xd, 0x89, 0x66, 0xe2, 0x73, 0xf7, 0x4c, 0xc8, 0x59, 0xdd, 0xd7, 0x53, 0xc2, 0x46, 0xfd, 0x79, 0xe8, 0x6c, 0x83, 0x7, 0x96, 0x12, 0xa9, 0x2d, 0xbc, 0x38, 0x7f, 0xfb, 0x6a, 0xee, 0x55, 0xd1, 0x40, 0xc4, 0x2b, 0xaf, 0x3e, 0xba, 0x1, 0x85, 0x14, 0x90, 0x29, 0xad, 0x3c, 0xb8, 0x3, 0x87, 0x16, 0x92, 0x7d, 0xf9, 0x68, 0xec, 0x57, 0xd3, 0x42, 0xc6, 0x81, 0x5, 0x94, 0x10, 0xab, 0x2f, 0xbe, 0x3a, 0xd5, 0x51, 0xc0, 0x44, 0xff, 0x7b, 0xea, 0x6e, 0x64, 0xe0, 0x71, 0xf5, 0x4e, 0xca, 0x5b, 0xdf, 0x30, 0xb4, 0x25, 0xa1, 0x1a, 0x9e, 0xf, 0x8b, 0xcc, 0x48, 0xd9, 0x5d, 0xe6, 0x62, 0xf3, 0x77, 0x98, 0x1c, 0x8d, 0x9, 0xb2, 0x36, 0xa7, 0x23, 0xb3, 0x37, 0xa6, 0x22, 0x99, 0x1d, 0x8c, 0x8, 0xe7, 0x63, 0xf2, 0x76, 0xcd, 0x49, 0xd8, 0x5c, 0x1b, 0x9f, 0xe, 0x8a, 0x31, 0xb5, 0x24, 0xa0, 0x4f, 0xcb, 0x5a, 0xde, 0x65, 0xe1, 0x70, 0xf4, 0xfe, 0x7a, 0xeb, 0x6f, 0xd4, 0x50, 0xc1, 0x45, 0xaa, 0x2e, 0xbf, 0x3b, 0x80, 0x4, 0x95, 0x11, 0x56, 0xd2, 0x43, 0xc7, 0x7c, 0xf8, 0x69, 0xed, 0x2, 0x86, 0x17, 0x93, 0x28, 0xac, 0x3d, 0xb9}, + {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0, 0xb8, 0x3d, 0xaf, 0x2a, 0x96, 0x13, 0x81, 0x4, 0xe4, 0x61, 0xf3, 0x76, 0xca, 0x4f, 0xdd, 0x58, 0x6d, 0xe8, 0x7a, 0xff, 0x43, 0xc6, 0x54, 0xd1, 0x31, 0xb4, 0x26, 0xa3, 0x1f, 0x9a, 0x8, 0x8d, 0xd5, 0x50, 0xc2, 0x47, 0xfb, 0x7e, 0xec, 0x69, 
0x89, 0xc, 0x9e, 0x1b, 0xa7, 0x22, 0xb0, 0x35, 0xda, 0x5f, 0xcd, 0x48, 0xf4, 0x71, 0xe3, 0x66, 0x86, 0x3, 0x91, 0x14, 0xa8, 0x2d, 0xbf, 0x3a, 0x62, 0xe7, 0x75, 0xf0, 0x4c, 0xc9, 0x5b, 0xde, 0x3e, 0xbb, 0x29, 0xac, 0x10, 0x95, 0x7, 0x82, 0xb7, 0x32, 0xa0, 0x25, 0x99, 0x1c, 0x8e, 0xb, 0xeb, 0x6e, 0xfc, 0x79, 0xc5, 0x40, 0xd2, 0x57, 0xf, 0x8a, 0x18, 0x9d, 0x21, 0xa4, 0x36, 0xb3, 0x53, 0xd6, 0x44, 0xc1, 0x7d, 0xf8, 0x6a, 0xef, 0xa9, 0x2c, 0xbe, 0x3b, 0x87, 0x2, 0x90, 0x15, 0xf5, 0x70, 0xe2, 0x67, 0xdb, 0x5e, 0xcc, 0x49, 0x11, 0x94, 0x6, 0x83, 0x3f, 0xba, 0x28, 0xad, 0x4d, 0xc8, 0x5a, 0xdf, 0x63, 0xe6, 0x74, 0xf1, 0xc4, 0x41, 0xd3, 0x56, 0xea, 0x6f, 0xfd, 0x78, 0x98, 0x1d, 0x8f, 0xa, 0xb6, 0x33, 0xa1, 0x24, 0x7c, 0xf9, 0x6b, 0xee, 0x52, 0xd7, 0x45, 0xc0, 0x20, 0xa5, 0x37, 0xb2, 0xe, 0x8b, 0x19, 0x9c, 0x73, 0xf6, 0x64, 0xe1, 0x5d, 0xd8, 0x4a, 0xcf, 0x2f, 0xaa, 0x38, 0xbd, 0x1, 0x84, 0x16, 0x93, 0xcb, 0x4e, 0xdc, 0x59, 0xe5, 0x60, 0xf2, 0x77, 0x97, 0x12, 0x80, 0x5, 0xb9, 0x3c, 0xae, 0x2b, 0x1e, 0x9b, 0x9, 0x8c, 0x30, 0xb5, 0x27, 0xa2, 0x42, 0xc7, 0x55, 0xd0, 0x6c, 0xe9, 0x7b, 0xfe, 0xa6, 0x23, 0xb1, 0x34, 0x88, 0xd, 0x9f, 0x1a, 0xfa, 0x7f, 0xed, 0x68, 0xd4, 0x51, 0xc3, 0x46}, + {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1, 0x88, 0xe, 0x99, 0x1f, 0xaa, 0x2c, 0xbb, 0x3d, 0xcc, 0x4a, 0xdd, 0x5b, 0xee, 0x68, 0xff, 0x79, 0xd, 0x8b, 0x1c, 0x9a, 0x2f, 0xa9, 0x3e, 0xb8, 0x49, 0xcf, 0x58, 0xde, 0x6b, 0xed, 0x7a, 0xfc, 0x85, 0x3, 0x94, 0x12, 0xa7, 0x21, 0xb6, 0x30, 0xc1, 0x47, 0xd0, 0x56, 0xe3, 0x65, 0xf2, 0x74, 0x1a, 0x9c, 0xb, 0x8d, 0x38, 0xbe, 0x29, 0xaf, 0x5e, 0xd8, 0x4f, 0xc9, 0x7c, 0xfa, 0x6d, 0xeb, 0x92, 0x14, 0x83, 0x5, 0xb0, 0x36, 0xa1, 0x27, 0xd6, 0x50, 0xc7, 0x41, 0xf4, 0x72, 0xe5, 0x63, 0x17, 0x91, 0x6, 0x80, 0x35, 0xb3, 0x24, 0xa2, 0x53, 0xd5, 0x42, 0xc4, 0x71, 0xf7, 0x60, 0xe6, 0x9f, 0x19, 0x8e, 0x8, 0xbd, 0x3b, 0xac, 0x2a, 0xdb, 0x5d, 0xca, 0x4c, 0xf9, 0x7f, 0xe8, 0x6e, 0x34, 0xb2, 0x25, 0xa3, 0x16, 0x90, 0x7, 0x81, 0x70, 0xf6, 0x61, 0xe7, 0x52, 0xd4, 0x43, 0xc5, 0xbc, 0x3a, 0xad, 0x2b, 0x9e, 0x18, 0x8f, 0x9, 0xf8, 0x7e, 0xe9, 0x6f, 0xda, 0x5c, 0xcb, 0x4d, 0x39, 0xbf, 0x28, 0xae, 0x1b, 0x9d, 0xa, 0x8c, 0x7d, 0xfb, 0x6c, 0xea, 0x5f, 0xd9, 0x4e, 0xc8, 0xb1, 0x37, 0xa0, 0x26, 0x93, 0x15, 0x82, 0x4, 0xf5, 0x73, 0xe4, 0x62, 0xd7, 0x51, 0xc6, 0x40, 0x2e, 0xa8, 0x3f, 0xb9, 0xc, 0x8a, 0x1d, 0x9b, 0x6a, 0xec, 0x7b, 0xfd, 0x48, 0xce, 0x59, 0xdf, 0xa6, 0x20, 0xb7, 0x31, 0x84, 0x2, 0x95, 0x13, 0xe2, 0x64, 0xf3, 0x75, 0xc0, 0x46, 0xd1, 0x57, 0x23, 0xa5, 0x32, 0xb4, 0x1, 0x87, 0x10, 0x96, 0x67, 0xe1, 0x76, 0xf0, 0x45, 0xc3, 0x54, 0xd2, 0xab, 0x2d, 0xba, 0x3c, 0x89, 0xf, 0x98, 0x1e, 0xef, 0x69, 0xfe, 0x78, 0xcd, 0x4b, 0xdc, 0x5a}, + {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe, 0x98, 0x1f, 0x8b, 0xc, 0xbe, 0x39, 0xad, 0x2a, 0xd4, 0x53, 0xc7, 0x40, 0xf2, 0x75, 0xe1, 0x66, 0x2d, 0xaa, 0x3e, 0xb9, 0xb, 0x8c, 0x18, 0x9f, 0x61, 0xe6, 0x72, 0xf5, 0x47, 0xc0, 0x54, 0xd3, 0xb5, 0x32, 0xa6, 0x21, 0x93, 0x14, 0x80, 0x7, 0xf9, 0x7e, 0xea, 0x6d, 0xdf, 0x58, 0xcc, 0x4b, 0x5a, 0xdd, 0x49, 0xce, 0x7c, 0xfb, 0x6f, 0xe8, 0x16, 0x91, 0x5, 0x82, 0x30, 0xb7, 0x23, 0xa4, 0xc2, 0x45, 0xd1, 0x56, 0xe4, 0x63, 0xf7, 0x70, 0x8e, 0x9, 0x9d, 0x1a, 0xa8, 0x2f, 0xbb, 0x3c, 0x77, 0xf0, 0x64, 0xe3, 0x51, 0xd6, 0x42, 0xc5, 0x3b, 0xbc, 0x28, 0xaf, 0x1d, 0x9a, 0xe, 0x89, 0xef, 0x68, 0xfc, 0x7b, 0xc9, 0x4e, 0xda, 0x5d, 0xa3, 0x24, 0xb0, 0x37, 0x85, 0x2, 0x96, 0x11, 0xb4, 0x33, 0xa7, 0x20, 0x92, 0x15, 0x81, 0x6, 0xf8, 0x7f, 0xeb, 0x6c, 0xde, 
0x59, 0xcd, 0x4a, 0x2c, 0xab, 0x3f, 0xb8, 0xa, 0x8d, 0x19, 0x9e, 0x60, 0xe7, 0x73, 0xf4, 0x46, 0xc1, 0x55, 0xd2, 0x99, 0x1e, 0x8a, 0xd, 0xbf, 0x38, 0xac, 0x2b, 0xd5, 0x52, 0xc6, 0x41, 0xf3, 0x74, 0xe0, 0x67, 0x1, 0x86, 0x12, 0x95, 0x27, 0xa0, 0x34, 0xb3, 0x4d, 0xca, 0x5e, 0xd9, 0x6b, 0xec, 0x78, 0xff, 0xee, 0x69, 0xfd, 0x7a, 0xc8, 0x4f, 0xdb, 0x5c, 0xa2, 0x25, 0xb1, 0x36, 0x84, 0x3, 0x97, 0x10, 0x76, 0xf1, 0x65, 0xe2, 0x50, 0xd7, 0x43, 0xc4, 0x3a, 0xbd, 0x29, 0xae, 0x1c, 0x9b, 0xf, 0x88, 0xc3, 0x44, 0xd0, 0x57, 0xe5, 0x62, 0xf6, 0x71, 0x8f, 0x8, 0x9c, 0x1b, 0xa9, 0x2e, 0xba, 0x3d, 0x5b, 0xdc, 0x48, 0xcf, 0x7d, 0xfa, 0x6e, 0xe9, 0x17, 0x90, 0x4, 0x83, 0x31, 0xb6, 0x22, 0xa5}, + {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab, 0x68, 0xe0, 0x65, 0xed, 0x72, 0xfa, 0x7f, 0xf7, 0x5c, 0xd4, 0x51, 0xd9, 0x46, 0xce, 0x4b, 0xc3, 0xd0, 0x58, 0xdd, 0x55, 0xca, 0x42, 0xc7, 0x4f, 0xe4, 0x6c, 0xe9, 0x61, 0xfe, 0x76, 0xf3, 0x7b, 0xb8, 0x30, 0xb5, 0x3d, 0xa2, 0x2a, 0xaf, 0x27, 0x8c, 0x4, 0x81, 0x9, 0x96, 0x1e, 0x9b, 0x13, 0xbd, 0x35, 0xb0, 0x38, 0xa7, 0x2f, 0xaa, 0x22, 0x89, 0x1, 0x84, 0xc, 0x93, 0x1b, 0x9e, 0x16, 0xd5, 0x5d, 0xd8, 0x50, 0xcf, 0x47, 0xc2, 0x4a, 0xe1, 0x69, 0xec, 0x64, 0xfb, 0x73, 0xf6, 0x7e, 0x6d, 0xe5, 0x60, 0xe8, 0x77, 0xff, 0x7a, 0xf2, 0x59, 0xd1, 0x54, 0xdc, 0x43, 0xcb, 0x4e, 0xc6, 0x5, 0x8d, 0x8, 0x80, 0x1f, 0x97, 0x12, 0x9a, 0x31, 0xb9, 0x3c, 0xb4, 0x2b, 0xa3, 0x26, 0xae, 0x67, 0xef, 0x6a, 0xe2, 0x7d, 0xf5, 0x70, 0xf8, 0x53, 0xdb, 0x5e, 0xd6, 0x49, 0xc1, 0x44, 0xcc, 0xf, 0x87, 0x2, 0x8a, 0x15, 0x9d, 0x18, 0x90, 0x3b, 0xb3, 0x36, 0xbe, 0x21, 0xa9, 0x2c, 0xa4, 0xb7, 0x3f, 0xba, 0x32, 0xad, 0x25, 0xa0, 0x28, 0x83, 0xb, 0x8e, 0x6, 0x99, 0x11, 0x94, 0x1c, 0xdf, 0x57, 0xd2, 0x5a, 0xc5, 0x4d, 0xc8, 0x40, 0xeb, 0x63, 0xe6, 0x6e, 0xf1, 0x79, 0xfc, 0x74, 0xda, 0x52, 0xd7, 0x5f, 0xc0, 0x48, 0xcd, 0x45, 0xee, 0x66, 0xe3, 0x6b, 0xf4, 0x7c, 0xf9, 0x71, 0xb2, 0x3a, 0xbf, 0x37, 0xa8, 0x20, 0xa5, 0x2d, 0x86, 0xe, 0x8b, 0x3, 0x9c, 0x14, 0x91, 0x19, 0xa, 0x82, 0x7, 0x8f, 0x10, 0x98, 0x1d, 0x95, 0x3e, 0xb6, 0x33, 0xbb, 0x24, 0xac, 0x29, 0xa1, 0x62, 0xea, 0x6f, 0xe7, 0x78, 0xf0, 0x75, 0xfd, 0x56, 0xde, 0x5b, 0xd3, 0x4c, 0xc4, 0x41, 0xc9}, + {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4, 0x78, 0xf1, 0x77, 0xfe, 0x66, 0xef, 0x69, 0xe0, 0x44, 0xcd, 0x4b, 0xc2, 0x5a, 0xd3, 0x55, 0xdc, 0xf0, 0x79, 0xff, 0x76, 0xee, 0x67, 0xe1, 0x68, 0xcc, 0x45, 0xc3, 0x4a, 0xd2, 0x5b, 0xdd, 0x54, 0x88, 0x1, 0x87, 0xe, 0x96, 0x1f, 0x99, 0x10, 0xb4, 0x3d, 0xbb, 0x32, 0xaa, 0x23, 0xa5, 0x2c, 0xfd, 0x74, 0xf2, 0x7b, 0xe3, 0x6a, 0xec, 0x65, 0xc1, 0x48, 0xce, 0x47, 0xdf, 0x56, 0xd0, 0x59, 0x85, 0xc, 0x8a, 0x3, 0x9b, 0x12, 0x94, 0x1d, 0xb9, 0x30, 0xb6, 0x3f, 0xa7, 0x2e, 0xa8, 0x21, 0xd, 0x84, 0x2, 0x8b, 0x13, 0x9a, 0x1c, 0x95, 0x31, 0xb8, 0x3e, 0xb7, 0x2f, 0xa6, 0x20, 0xa9, 0x75, 0xfc, 0x7a, 0xf3, 0x6b, 0xe2, 0x64, 0xed, 0x49, 0xc0, 0x46, 0xcf, 0x57, 0xde, 0x58, 0xd1, 0xe7, 0x6e, 0xe8, 0x61, 0xf9, 0x70, 0xf6, 0x7f, 0xdb, 0x52, 0xd4, 0x5d, 0xc5, 0x4c, 0xca, 0x43, 0x9f, 0x16, 0x90, 0x19, 0x81, 0x8, 0x8e, 0x7, 0xa3, 0x2a, 0xac, 0x25, 0xbd, 0x34, 0xb2, 0x3b, 0x17, 0x9e, 0x18, 0x91, 0x9, 0x80, 0x6, 0x8f, 0x2b, 0xa2, 0x24, 0xad, 0x35, 0xbc, 0x3a, 0xb3, 0x6f, 0xe6, 0x60, 0xe9, 0x71, 0xf8, 0x7e, 0xf7, 0x53, 0xda, 0x5c, 0xd5, 0x4d, 0xc4, 0x42, 0xcb, 0x1a, 0x93, 0x15, 0x9c, 0x4, 0x8d, 0xb, 0x82, 0x26, 0xaf, 0x29, 0xa0, 0x38, 0xb1, 0x37, 0xbe, 0x62, 0xeb, 0x6d, 0xe4, 0x7c, 0xf5, 0x73, 0xfa, 0x5e, 0xd7, 0x51, 0xd8, 0x40, 0xc9, 0x4f, 0xc6, 0xea, 0x63, 
0xe5, 0x6c, 0xf4, 0x7d, 0xfb, 0x72, 0xd6, 0x5f, 0xd9, 0x50, 0xc8, 0x41, 0xc7, 0x4e, 0x92, 0x1b, 0x9d, 0x14, 0x8c, 0x5, 0x83, 0xa, 0xae, 0x27, 0xa1, 0x28, 0xb0, 0x39, 0xbf, 0x36}, + {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5, 0x48, 0xc2, 0x41, 0xcb, 0x5a, 0xd0, 0x53, 0xd9, 0x6c, 0xe6, 0x65, 0xef, 0x7e, 0xf4, 0x77, 0xfd, 0x90, 0x1a, 0x99, 0x13, 0x82, 0x8, 0x8b, 0x1, 0xb4, 0x3e, 0xbd, 0x37, 0xa6, 0x2c, 0xaf, 0x25, 0xd8, 0x52, 0xd1, 0x5b, 0xca, 0x40, 0xc3, 0x49, 0xfc, 0x76, 0xf5, 0x7f, 0xee, 0x64, 0xe7, 0x6d, 0x3d, 0xb7, 0x34, 0xbe, 0x2f, 0xa5, 0x26, 0xac, 0x19, 0x93, 0x10, 0x9a, 0xb, 0x81, 0x2, 0x88, 0x75, 0xff, 0x7c, 0xf6, 0x67, 0xed, 0x6e, 0xe4, 0x51, 0xdb, 0x58, 0xd2, 0x43, 0xc9, 0x4a, 0xc0, 0xad, 0x27, 0xa4, 0x2e, 0xbf, 0x35, 0xb6, 0x3c, 0x89, 0x3, 0x80, 0xa, 0x9b, 0x11, 0x92, 0x18, 0xe5, 0x6f, 0xec, 0x66, 0xf7, 0x7d, 0xfe, 0x74, 0xc1, 0x4b, 0xc8, 0x42, 0xd3, 0x59, 0xda, 0x50, 0x7a, 0xf0, 0x73, 0xf9, 0x68, 0xe2, 0x61, 0xeb, 0x5e, 0xd4, 0x57, 0xdd, 0x4c, 0xc6, 0x45, 0xcf, 0x32, 0xb8, 0x3b, 0xb1, 0x20, 0xaa, 0x29, 0xa3, 0x16, 0x9c, 0x1f, 0x95, 0x4, 0x8e, 0xd, 0x87, 0xea, 0x60, 0xe3, 0x69, 0xf8, 0x72, 0xf1, 0x7b, 0xce, 0x44, 0xc7, 0x4d, 0xdc, 0x56, 0xd5, 0x5f, 0xa2, 0x28, 0xab, 0x21, 0xb0, 0x3a, 0xb9, 0x33, 0x86, 0xc, 0x8f, 0x5, 0x94, 0x1e, 0x9d, 0x17, 0x47, 0xcd, 0x4e, 0xc4, 0x55, 0xdf, 0x5c, 0xd6, 0x63, 0xe9, 0x6a, 0xe0, 0x71, 0xfb, 0x78, 0xf2, 0xf, 0x85, 0x6, 0x8c, 0x1d, 0x97, 0x14, 0x9e, 0x2b, 0xa1, 0x22, 0xa8, 0x39, 0xb3, 0x30, 0xba, 0xd7, 0x5d, 0xde, 0x54, 0xc5, 0x4f, 0xcc, 0x46, 0xf3, 0x79, 0xfa, 0x70, 0xe1, 0x6b, 0xe8, 0x62, 0x9f, 0x15, 0x96, 0x1c, 0x8d, 0x7, 0x84, 0xe, 0xbb, 0x31, 0xb2, 0x38, 0xa9, 0x23, 0xa0, 0x2a}, + {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba, 0x58, 0xd3, 0x53, 0xd8, 0x4e, 0xc5, 0x45, 0xce, 0x74, 0xff, 0x7f, 0xf4, 0x62, 0xe9, 0x69, 0xe2, 0xb0, 0x3b, 0xbb, 0x30, 0xa6, 0x2d, 0xad, 0x26, 0x9c, 0x17, 0x97, 0x1c, 0x8a, 0x1, 0x81, 0xa, 0xe8, 0x63, 0xe3, 0x68, 0xfe, 0x75, 0xf5, 0x7e, 0xc4, 0x4f, 0xcf, 0x44, 0xd2, 0x59, 0xd9, 0x52, 0x7d, 0xf6, 0x76, 0xfd, 0x6b, 0xe0, 0x60, 0xeb, 0x51, 0xda, 0x5a, 0xd1, 0x47, 0xcc, 0x4c, 0xc7, 0x25, 0xae, 0x2e, 0xa5, 0x33, 0xb8, 0x38, 0xb3, 0x9, 0x82, 0x2, 0x89, 0x1f, 0x94, 0x14, 0x9f, 0xcd, 0x46, 0xc6, 0x4d, 0xdb, 0x50, 0xd0, 0x5b, 0xe1, 0x6a, 0xea, 0x61, 0xf7, 0x7c, 0xfc, 0x77, 0x95, 0x1e, 0x9e, 0x15, 0x83, 0x8, 0x88, 0x3, 0xb9, 0x32, 0xb2, 0x39, 0xaf, 0x24, 0xa4, 0x2f, 0xfa, 0x71, 0xf1, 0x7a, 0xec, 0x67, 0xe7, 0x6c, 0xd6, 0x5d, 0xdd, 0x56, 0xc0, 0x4b, 0xcb, 0x40, 0xa2, 0x29, 0xa9, 0x22, 0xb4, 0x3f, 0xbf, 0x34, 0x8e, 0x5, 0x85, 0xe, 0x98, 0x13, 0x93, 0x18, 0x4a, 0xc1, 0x41, 0xca, 0x5c, 0xd7, 0x57, 0xdc, 0x66, 0xed, 0x6d, 0xe6, 0x70, 0xfb, 0x7b, 0xf0, 0x12, 0x99, 0x19, 0x92, 0x4, 0x8f, 0xf, 0x84, 0x3e, 0xb5, 0x35, 0xbe, 0x28, 0xa3, 0x23, 0xa8, 0x87, 0xc, 0x8c, 0x7, 0x91, 0x1a, 0x9a, 0x11, 0xab, 0x20, 0xa0, 0x2b, 0xbd, 0x36, 0xb6, 0x3d, 0xdf, 0x54, 0xd4, 0x5f, 0xc9, 0x42, 0xc2, 0x49, 0xf3, 0x78, 0xf8, 0x73, 0xe5, 0x6e, 0xee, 0x65, 0x37, 0xbc, 0x3c, 0xb7, 0x21, 0xaa, 0x2a, 0xa1, 0x1b, 0x90, 0x10, 0x9b, 0xd, 0x86, 0x6, 0x8d, 0x6f, 0xe4, 0x64, 0xef, 0x79, 0xf2, 0x72, 0xf9, 0x43, 0xc8, 0x48, 0xc3, 0x55, 0xde, 0x5e, 0xd5}, + {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97, 0x28, 0xa4, 0x2d, 0xa1, 0x22, 0xae, 0x27, 0xab, 0x3c, 0xb0, 0x39, 0xb5, 0x36, 0xba, 0x33, 0xbf, 0x50, 0xdc, 0x55, 0xd9, 0x5a, 0xd6, 0x5f, 0xd3, 0x44, 0xc8, 0x41, 0xcd, 0x4e, 0xc2, 0x4b, 0xc7, 0x78, 0xf4, 0x7d, 0xf1, 0x72, 0xfe, 
0x77, 0xfb, 0x6c, 0xe0, 0x69, 0xe5, 0x66, 0xea, 0x63, 0xef, 0xa0, 0x2c, 0xa5, 0x29, 0xaa, 0x26, 0xaf, 0x23, 0xb4, 0x38, 0xb1, 0x3d, 0xbe, 0x32, 0xbb, 0x37, 0x88, 0x4, 0x8d, 0x1, 0x82, 0xe, 0x87, 0xb, 0x9c, 0x10, 0x99, 0x15, 0x96, 0x1a, 0x93, 0x1f, 0xf0, 0x7c, 0xf5, 0x79, 0xfa, 0x76, 0xff, 0x73, 0xe4, 0x68, 0xe1, 0x6d, 0xee, 0x62, 0xeb, 0x67, 0xd8, 0x54, 0xdd, 0x51, 0xd2, 0x5e, 0xd7, 0x5b, 0xcc, 0x40, 0xc9, 0x45, 0xc6, 0x4a, 0xc3, 0x4f, 0x5d, 0xd1, 0x58, 0xd4, 0x57, 0xdb, 0x52, 0xde, 0x49, 0xc5, 0x4c, 0xc0, 0x43, 0xcf, 0x46, 0xca, 0x75, 0xf9, 0x70, 0xfc, 0x7f, 0xf3, 0x7a, 0xf6, 0x61, 0xed, 0x64, 0xe8, 0x6b, 0xe7, 0x6e, 0xe2, 0xd, 0x81, 0x8, 0x84, 0x7, 0x8b, 0x2, 0x8e, 0x19, 0x95, 0x1c, 0x90, 0x13, 0x9f, 0x16, 0x9a, 0x25, 0xa9, 0x20, 0xac, 0x2f, 0xa3, 0x2a, 0xa6, 0x31, 0xbd, 0x34, 0xb8, 0x3b, 0xb7, 0x3e, 0xb2, 0xfd, 0x71, 0xf8, 0x74, 0xf7, 0x7b, 0xf2, 0x7e, 0xe9, 0x65, 0xec, 0x60, 0xe3, 0x6f, 0xe6, 0x6a, 0xd5, 0x59, 0xd0, 0x5c, 0xdf, 0x53, 0xda, 0x56, 0xc1, 0x4d, 0xc4, 0x48, 0xcb, 0x47, 0xce, 0x42, 0xad, 0x21, 0xa8, 0x24, 0xa7, 0x2b, 0xa2, 0x2e, 0xb9, 0x35, 0xbc, 0x30, 0xb3, 0x3f, 0xb6, 0x3a, 0x85, 0x9, 0x80, 0xc, 0x8f, 0x3, 0x8a, 0x6, 0x91, 0x1d, 0x94, 0x18, 0x9b, 0x17, 0x9e, 0x12}, + {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98, 0x38, 0xb5, 0x3f, 0xb2, 0x36, 0xbb, 0x31, 0xbc, 0x24, 0xa9, 0x23, 0xae, 0x2a, 0xa7, 0x2d, 0xa0, 0x70, 0xfd, 0x77, 0xfa, 0x7e, 0xf3, 0x79, 0xf4, 0x6c, 0xe1, 0x6b, 0xe6, 0x62, 0xef, 0x65, 0xe8, 0x48, 0xc5, 0x4f, 0xc2, 0x46, 0xcb, 0x41, 0xcc, 0x54, 0xd9, 0x53, 0xde, 0x5a, 0xd7, 0x5d, 0xd0, 0xe0, 0x6d, 0xe7, 0x6a, 0xee, 0x63, 0xe9, 0x64, 0xfc, 0x71, 0xfb, 0x76, 0xf2, 0x7f, 0xf5, 0x78, 0xd8, 0x55, 0xdf, 0x52, 0xd6, 0x5b, 0xd1, 0x5c, 0xc4, 0x49, 0xc3, 0x4e, 0xca, 0x47, 0xcd, 0x40, 0x90, 0x1d, 0x97, 0x1a, 0x9e, 0x13, 0x99, 0x14, 0x8c, 0x1, 0x8b, 0x6, 0x82, 0xf, 0x85, 0x8, 0xa8, 0x25, 0xaf, 0x22, 0xa6, 0x2b, 0xa1, 0x2c, 0xb4, 0x39, 0xb3, 0x3e, 0xba, 0x37, 0xbd, 0x30, 0xdd, 0x50, 0xda, 0x57, 0xd3, 0x5e, 0xd4, 0x59, 0xc1, 0x4c, 0xc6, 0x4b, 0xcf, 0x42, 0xc8, 0x45, 0xe5, 0x68, 0xe2, 0x6f, 0xeb, 0x66, 0xec, 0x61, 0xf9, 0x74, 0xfe, 0x73, 0xf7, 0x7a, 0xf0, 0x7d, 0xad, 0x20, 0xaa, 0x27, 0xa3, 0x2e, 0xa4, 0x29, 0xb1, 0x3c, 0xb6, 0x3b, 0xbf, 0x32, 0xb8, 0x35, 0x95, 0x18, 0x92, 0x1f, 0x9b, 0x16, 0x9c, 0x11, 0x89, 0x4, 0x8e, 0x3, 0x87, 0xa, 0x80, 0xd, 0x3d, 0xb0, 0x3a, 0xb7, 0x33, 0xbe, 0x34, 0xb9, 0x21, 0xac, 0x26, 0xab, 0x2f, 0xa2, 0x28, 0xa5, 0x5, 0x88, 0x2, 0x8f, 0xb, 0x86, 0xc, 0x81, 0x19, 0x94, 0x1e, 0x93, 0x17, 0x9a, 0x10, 0x9d, 0x4d, 0xc0, 0x4a, 0xc7, 0x43, 0xce, 0x44, 0xc9, 0x51, 0xdc, 0x56, 0xdb, 0x5f, 0xd2, 0x58, 0xd5, 0x75, 0xf8, 0x72, 0xff, 0x7b, 0xf6, 0x7c, 0xf1, 0x69, 0xe4, 0x6e, 0xe3, 0x67, 0xea, 0x60, 0xed}, + {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89, 0x8, 0x86, 0x9, 0x87, 0xa, 0x84, 0xb, 0x85, 0xc, 0x82, 0xd, 0x83, 0xe, 0x80, 0xf, 0x81, 0x10, 0x9e, 0x11, 0x9f, 0x12, 0x9c, 0x13, 0x9d, 0x14, 0x9a, 0x15, 0x9b, 0x16, 0x98, 0x17, 0x99, 0x18, 0x96, 0x19, 0x97, 0x1a, 0x94, 0x1b, 0x95, 0x1c, 0x92, 0x1d, 0x93, 0x1e, 0x90, 0x1f, 0x91, 0x20, 0xae, 0x21, 0xaf, 0x22, 0xac, 0x23, 0xad, 0x24, 0xaa, 0x25, 0xab, 0x26, 0xa8, 0x27, 0xa9, 0x28, 0xa6, 0x29, 0xa7, 0x2a, 0xa4, 0x2b, 0xa5, 0x2c, 0xa2, 0x2d, 0xa3, 0x2e, 0xa0, 0x2f, 0xa1, 0x30, 0xbe, 0x31, 0xbf, 0x32, 0xbc, 0x33, 0xbd, 0x34, 0xba, 0x35, 0xbb, 0x36, 0xb8, 0x37, 0xb9, 0x38, 0xb6, 0x39, 0xb7, 0x3a, 0xb4, 0x3b, 0xb5, 0x3c, 0xb2, 0x3d, 0xb3, 0x3e, 0xb0, 0x3f, 0xb1, 0x40, 0xce, 0x41, 0xcf, 0x42, 0xcc, 0x43, 0xcd, 0x44, 0xca, 0x45, 0xcb, 
0x46, 0xc8, 0x47, 0xc9, 0x48, 0xc6, 0x49, 0xc7, 0x4a, 0xc4, 0x4b, 0xc5, 0x4c, 0xc2, 0x4d, 0xc3, 0x4e, 0xc0, 0x4f, 0xc1, 0x50, 0xde, 0x51, 0xdf, 0x52, 0xdc, 0x53, 0xdd, 0x54, 0xda, 0x55, 0xdb, 0x56, 0xd8, 0x57, 0xd9, 0x58, 0xd6, 0x59, 0xd7, 0x5a, 0xd4, 0x5b, 0xd5, 0x5c, 0xd2, 0x5d, 0xd3, 0x5e, 0xd0, 0x5f, 0xd1, 0x60, 0xee, 0x61, 0xef, 0x62, 0xec, 0x63, 0xed, 0x64, 0xea, 0x65, 0xeb, 0x66, 0xe8, 0x67, 0xe9, 0x68, 0xe6, 0x69, 0xe7, 0x6a, 0xe4, 0x6b, 0xe5, 0x6c, 0xe2, 0x6d, 0xe3, 0x6e, 0xe0, 0x6f, 0xe1, 0x70, 0xfe, 0x71, 0xff, 0x72, 0xfc, 0x73, 0xfd, 0x74, 0xfa, 0x75, 0xfb, 0x76, 0xf8, 0x77, 0xf9, 0x78, 0xf6, 0x79, 0xf7, 0x7a, 0xf4, 0x7b, 0xf5, 0x7c, 0xf2, 0x7d, 0xf3, 0x7e, 0xf0, 0x7f, 0xf1}, + {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86, 0x18, 0x97, 0x1b, 0x94, 0x1e, 0x91, 0x1d, 0x92, 0x14, 0x9b, 0x17, 0x98, 0x12, 0x9d, 0x11, 0x9e, 0x30, 0xbf, 0x33, 0xbc, 0x36, 0xb9, 0x35, 0xba, 0x3c, 0xb3, 0x3f, 0xb0, 0x3a, 0xb5, 0x39, 0xb6, 0x28, 0xa7, 0x2b, 0xa4, 0x2e, 0xa1, 0x2d, 0xa2, 0x24, 0xab, 0x27, 0xa8, 0x22, 0xad, 0x21, 0xae, 0x60, 0xef, 0x63, 0xec, 0x66, 0xe9, 0x65, 0xea, 0x6c, 0xe3, 0x6f, 0xe0, 0x6a, 0xe5, 0x69, 0xe6, 0x78, 0xf7, 0x7b, 0xf4, 0x7e, 0xf1, 0x7d, 0xf2, 0x74, 0xfb, 0x77, 0xf8, 0x72, 0xfd, 0x71, 0xfe, 0x50, 0xdf, 0x53, 0xdc, 0x56, 0xd9, 0x55, 0xda, 0x5c, 0xd3, 0x5f, 0xd0, 0x5a, 0xd5, 0x59, 0xd6, 0x48, 0xc7, 0x4b, 0xc4, 0x4e, 0xc1, 0x4d, 0xc2, 0x44, 0xcb, 0x47, 0xc8, 0x42, 0xcd, 0x41, 0xce, 0xc0, 0x4f, 0xc3, 0x4c, 0xc6, 0x49, 0xc5, 0x4a, 0xcc, 0x43, 0xcf, 0x40, 0xca, 0x45, 0xc9, 0x46, 0xd8, 0x57, 0xdb, 0x54, 0xde, 0x51, 0xdd, 0x52, 0xd4, 0x5b, 0xd7, 0x58, 0xd2, 0x5d, 0xd1, 0x5e, 0xf0, 0x7f, 0xf3, 0x7c, 0xf6, 0x79, 0xf5, 0x7a, 0xfc, 0x73, 0xff, 0x70, 0xfa, 0x75, 0xf9, 0x76, 0xe8, 0x67, 0xeb, 0x64, 0xee, 0x61, 0xed, 0x62, 0xe4, 0x6b, 0xe7, 0x68, 0xe2, 0x6d, 0xe1, 0x6e, 0xa0, 0x2f, 0xa3, 0x2c, 0xa6, 0x29, 0xa5, 0x2a, 0xac, 0x23, 0xaf, 0x20, 0xaa, 0x25, 0xa9, 0x26, 0xb8, 0x37, 0xbb, 0x34, 0xbe, 0x31, 0xbd, 0x32, 0xb4, 0x3b, 0xb7, 0x38, 0xb2, 0x3d, 0xb1, 0x3e, 0x90, 0x1f, 0x93, 0x1c, 0x96, 0x19, 0x95, 0x1a, 0x9c, 0x13, 0x9f, 0x10, 0x9a, 0x15, 0x99, 0x16, 0x88, 0x7, 0x8b, 0x4, 0x8e, 0x1, 0x8d, 0x2, 0x84, 0xb, 0x87, 0x8, 0x82, 0xd, 0x81, 0xe}, + {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23, 0xf5, 0x65, 0xc8, 0x58, 0x8f, 0x1f, 0xb2, 0x22, 0x1, 0x91, 0x3c, 0xac, 0x7b, 0xeb, 0x46, 0xd6, 0xf7, 0x67, 0xca, 0x5a, 0x8d, 0x1d, 0xb0, 0x20, 0x3, 0x93, 0x3e, 0xae, 0x79, 0xe9, 0x44, 0xd4, 0x2, 0x92, 0x3f, 0xaf, 0x78, 0xe8, 0x45, 0xd5, 0xf6, 0x66, 0xcb, 0x5b, 0x8c, 0x1c, 0xb1, 0x21, 0xf3, 0x63, 0xce, 0x5e, 0x89, 0x19, 0xb4, 0x24, 0x7, 0x97, 0x3a, 0xaa, 0x7d, 0xed, 0x40, 0xd0, 0x6, 0x96, 0x3b, 0xab, 0x7c, 0xec, 0x41, 0xd1, 0xf2, 0x62, 0xcf, 0x5f, 0x88, 0x18, 0xb5, 0x25, 0x4, 0x94, 0x39, 0xa9, 0x7e, 0xee, 0x43, 0xd3, 0xf0, 0x60, 0xcd, 0x5d, 0x8a, 0x1a, 0xb7, 0x27, 0xf1, 0x61, 0xcc, 0x5c, 0x8b, 0x1b, 0xb6, 0x26, 0x5, 0x95, 0x38, 0xa8, 0x7f, 0xef, 0x42, 0xd2, 0xfb, 0x6b, 0xc6, 0x56, 0x81, 0x11, 0xbc, 0x2c, 0xf, 0x9f, 0x32, 0xa2, 0x75, 0xe5, 0x48, 0xd8, 0xe, 0x9e, 0x33, 0xa3, 0x74, 0xe4, 0x49, 0xd9, 0xfa, 0x6a, 0xc7, 0x57, 0x80, 0x10, 0xbd, 0x2d, 0xc, 0x9c, 0x31, 0xa1, 0x76, 0xe6, 0x4b, 0xdb, 0xf8, 0x68, 0xc5, 0x55, 0x82, 0x12, 0xbf, 0x2f, 0xf9, 0x69, 0xc4, 0x54, 0x83, 0x13, 0xbe, 0x2e, 0xd, 0x9d, 0x30, 0xa0, 0x77, 0xe7, 0x4a, 0xda, 0x8, 0x98, 0x35, 0xa5, 0x72, 0xe2, 0x4f, 0xdf, 0xfc, 0x6c, 0xc1, 0x51, 0x86, 0x16, 0xbb, 0x2b, 0xfd, 0x6d, 0xc0, 0x50, 0x87, 0x17, 0xba, 0x2a, 0x9, 0x99, 0x34, 0xa4, 0x73, 0xe3, 0x4e, 0xde, 
0xff, 0x6f, 0xc2, 0x52, 0x85, 0x15, 0xb8, 0x28, 0xb, 0x9b, 0x36, 0xa6, 0x71, 0xe1, 0x4c, 0xdc, 0xa, 0x9a, 0x37, 0xa7, 0x70, 0xe0, 0x4d, 0xdd, 0xfe, 0x6e, 0xc3, 0x53, 0x84, 0x14, 0xb9, 0x29}, + {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c, 0xe5, 0x74, 0xda, 0x4b, 0x9b, 0xa, 0xa4, 0x35, 0x19, 0x88, 0x26, 0xb7, 0x67, 0xf6, 0x58, 0xc9, 0xd7, 0x46, 0xe8, 0x79, 0xa9, 0x38, 0x96, 0x7, 0x2b, 0xba, 0x14, 0x85, 0x55, 0xc4, 0x6a, 0xfb, 0x32, 0xa3, 0xd, 0x9c, 0x4c, 0xdd, 0x73, 0xe2, 0xce, 0x5f, 0xf1, 0x60, 0xb0, 0x21, 0x8f, 0x1e, 0xb3, 0x22, 0x8c, 0x1d, 0xcd, 0x5c, 0xf2, 0x63, 0x4f, 0xde, 0x70, 0xe1, 0x31, 0xa0, 0xe, 0x9f, 0x56, 0xc7, 0x69, 0xf8, 0x28, 0xb9, 0x17, 0x86, 0xaa, 0x3b, 0x95, 0x4, 0xd4, 0x45, 0xeb, 0x7a, 0x64, 0xf5, 0x5b, 0xca, 0x1a, 0x8b, 0x25, 0xb4, 0x98, 0x9, 0xa7, 0x36, 0xe6, 0x77, 0xd9, 0x48, 0x81, 0x10, 0xbe, 0x2f, 0xff, 0x6e, 0xc0, 0x51, 0x7d, 0xec, 0x42, 0xd3, 0x3, 0x92, 0x3c, 0xad, 0x7b, 0xea, 0x44, 0xd5, 0x5, 0x94, 0x3a, 0xab, 0x87, 0x16, 0xb8, 0x29, 0xf9, 0x68, 0xc6, 0x57, 0x9e, 0xf, 0xa1, 0x30, 0xe0, 0x71, 0xdf, 0x4e, 0x62, 0xf3, 0x5d, 0xcc, 0x1c, 0x8d, 0x23, 0xb2, 0xac, 0x3d, 0x93, 0x2, 0xd2, 0x43, 0xed, 0x7c, 0x50, 0xc1, 0x6f, 0xfe, 0x2e, 0xbf, 0x11, 0x80, 0x49, 0xd8, 0x76, 0xe7, 0x37, 0xa6, 0x8, 0x99, 0xb5, 0x24, 0x8a, 0x1b, 0xcb, 0x5a, 0xf4, 0x65, 0xc8, 0x59, 0xf7, 0x66, 0xb6, 0x27, 0x89, 0x18, 0x34, 0xa5, 0xb, 0x9a, 0x4a, 0xdb, 0x75, 0xe4, 0x2d, 0xbc, 0x12, 0x83, 0x53, 0xc2, 0x6c, 0xfd, 0xd1, 0x40, 0xee, 0x7f, 0xaf, 0x3e, 0x90, 0x1, 0x1f, 0x8e, 0x20, 0xb1, 0x61, 0xf0, 0x5e, 0xcf, 0xe3, 0x72, 0xdc, 0x4d, 0x9d, 0xc, 0xa2, 0x33, 0xfa, 0x6b, 0xc5, 0x54, 0x84, 0x15, 0xbb, 0x2a, 0x6, 0x97, 0x39, 0xa8, 0x78, 0xe9, 0x47, 0xd6}, + {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d, 0xd5, 0x47, 0xec, 0x7e, 0xa7, 0x35, 0x9e, 0xc, 0x31, 0xa3, 0x8, 0x9a, 0x43, 0xd1, 0x7a, 0xe8, 0xb7, 0x25, 0x8e, 0x1c, 0xc5, 0x57, 0xfc, 0x6e, 0x53, 0xc1, 0x6a, 0xf8, 0x21, 0xb3, 0x18, 0x8a, 0x62, 0xf0, 0x5b, 0xc9, 0x10, 0x82, 0x29, 0xbb, 0x86, 0x14, 0xbf, 0x2d, 0xf4, 0x66, 0xcd, 0x5f, 0x73, 0xe1, 0x4a, 0xd8, 0x1, 0x93, 0x38, 0xaa, 0x97, 0x5, 0xae, 0x3c, 0xe5, 0x77, 0xdc, 0x4e, 0xa6, 0x34, 0x9f, 0xd, 0xd4, 0x46, 0xed, 0x7f, 0x42, 0xd0, 0x7b, 0xe9, 0x30, 0xa2, 0x9, 0x9b, 0xc4, 0x56, 0xfd, 0x6f, 0xb6, 0x24, 0x8f, 0x1d, 0x20, 0xb2, 0x19, 0x8b, 0x52, 0xc0, 0x6b, 0xf9, 0x11, 0x83, 0x28, 0xba, 0x63, 0xf1, 0x5a, 0xc8, 0xf5, 0x67, 0xcc, 0x5e, 0x87, 0x15, 0xbe, 0x2c, 0xe6, 0x74, 0xdf, 0x4d, 0x94, 0x6, 0xad, 0x3f, 0x2, 0x90, 0x3b, 0xa9, 0x70, 0xe2, 0x49, 0xdb, 0x33, 0xa1, 0xa, 0x98, 0x41, 0xd3, 0x78, 0xea, 0xd7, 0x45, 0xee, 0x7c, 0xa5, 0x37, 0x9c, 0xe, 0x51, 0xc3, 0x68, 0xfa, 0x23, 0xb1, 0x1a, 0x88, 0xb5, 0x27, 0x8c, 0x1e, 0xc7, 0x55, 0xfe, 0x6c, 0x84, 0x16, 0xbd, 0x2f, 0xf6, 0x64, 0xcf, 0x5d, 0x60, 0xf2, 0x59, 0xcb, 0x12, 0x80, 0x2b, 0xb9, 0x95, 0x7, 0xac, 0x3e, 0xe7, 0x75, 0xde, 0x4c, 0x71, 0xe3, 0x48, 0xda, 0x3, 0x91, 0x3a, 0xa8, 0x40, 0xd2, 0x79, 0xeb, 0x32, 0xa0, 0xb, 0x99, 0xa4, 0x36, 0x9d, 0xf, 0xd6, 0x44, 0xef, 0x7d, 0x22, 0xb0, 0x1b, 0x89, 0x50, 0xc2, 0x69, 0xfb, 0xc6, 0x54, 0xff, 0x6d, 0xb4, 0x26, 0x8d, 0x1f, 0xf7, 0x65, 0xce, 0x5c, 0x85, 0x17, 0xbc, 0x2e, 0x13, 0x81, 0x2a, 0xb8, 0x61, 0xf3, 0x58, 0xca}, + {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32, 0xc5, 0x56, 0xfe, 0x6d, 0xb3, 0x20, 0x88, 0x1b, 0x29, 0xba, 0x12, 0x81, 0x5f, 0xcc, 0x64, 0xf7, 0x97, 0x4, 0xac, 0x3f, 0xe1, 0x72, 0xda, 0x49, 0x7b, 0xe8, 0x40, 0xd3, 0xd, 0x9e, 0x36, 0xa5, 0x52, 0xc1, 0x69, 0xfa, 
0x24, 0xb7, 0x1f, 0x8c, 0xbe, 0x2d, 0x85, 0x16, 0xc8, 0x5b, 0xf3, 0x60, 0x33, 0xa0, 0x8, 0x9b, 0x45, 0xd6, 0x7e, 0xed, 0xdf, 0x4c, 0xe4, 0x77, 0xa9, 0x3a, 0x92, 0x1, 0xf6, 0x65, 0xcd, 0x5e, 0x80, 0x13, 0xbb, 0x28, 0x1a, 0x89, 0x21, 0xb2, 0x6c, 0xff, 0x57, 0xc4, 0xa4, 0x37, 0x9f, 0xc, 0xd2, 0x41, 0xe9, 0x7a, 0x48, 0xdb, 0x73, 0xe0, 0x3e, 0xad, 0x5, 0x96, 0x61, 0xf2, 0x5a, 0xc9, 0x17, 0x84, 0x2c, 0xbf, 0x8d, 0x1e, 0xb6, 0x25, 0xfb, 0x68, 0xc0, 0x53, 0x66, 0xf5, 0x5d, 0xce, 0x10, 0x83, 0x2b, 0xb8, 0x8a, 0x19, 0xb1, 0x22, 0xfc, 0x6f, 0xc7, 0x54, 0xa3, 0x30, 0x98, 0xb, 0xd5, 0x46, 0xee, 0x7d, 0x4f, 0xdc, 0x74, 0xe7, 0x39, 0xaa, 0x2, 0x91, 0xf1, 0x62, 0xca, 0x59, 0x87, 0x14, 0xbc, 0x2f, 0x1d, 0x8e, 0x26, 0xb5, 0x6b, 0xf8, 0x50, 0xc3, 0x34, 0xa7, 0xf, 0x9c, 0x42, 0xd1, 0x79, 0xea, 0xd8, 0x4b, 0xe3, 0x70, 0xae, 0x3d, 0x95, 0x6, 0x55, 0xc6, 0x6e, 0xfd, 0x23, 0xb0, 0x18, 0x8b, 0xb9, 0x2a, 0x82, 0x11, 0xcf, 0x5c, 0xf4, 0x67, 0x90, 0x3, 0xab, 0x38, 0xe6, 0x75, 0xdd, 0x4e, 0x7c, 0xef, 0x47, 0xd4, 0xa, 0x99, 0x31, 0xa2, 0xc2, 0x51, 0xf9, 0x6a, 0xb4, 0x27, 0x8f, 0x1c, 0x2e, 0xbd, 0x15, 0x86, 0x58, 0xcb, 0x63, 0xf0, 0x7, 0x94, 0x3c, 0xaf, 0x71, 0xe2, 0x4a, 0xd9, 0xeb, 0x78, 0xd0, 0x43, 0x9d, 0xe, 0xa6, 0x35}, + {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f, 0xb5, 0x21, 0x80, 0x14, 0xdf, 0x4b, 0xea, 0x7e, 0x61, 0xf5, 0x54, 0xc0, 0xb, 0x9f, 0x3e, 0xaa, 0x77, 0xe3, 0x42, 0xd6, 0x1d, 0x89, 0x28, 0xbc, 0xa3, 0x37, 0x96, 0x2, 0xc9, 0x5d, 0xfc, 0x68, 0xc2, 0x56, 0xf7, 0x63, 0xa8, 0x3c, 0x9d, 0x9, 0x16, 0x82, 0x23, 0xb7, 0x7c, 0xe8, 0x49, 0xdd, 0xee, 0x7a, 0xdb, 0x4f, 0x84, 0x10, 0xb1, 0x25, 0x3a, 0xae, 0xf, 0x9b, 0x50, 0xc4, 0x65, 0xf1, 0x5b, 0xcf, 0x6e, 0xfa, 0x31, 0xa5, 0x4, 0x90, 0x8f, 0x1b, 0xba, 0x2e, 0xe5, 0x71, 0xd0, 0x44, 0x99, 0xd, 0xac, 0x38, 0xf3, 0x67, 0xc6, 0x52, 0x4d, 0xd9, 0x78, 0xec, 0x27, 0xb3, 0x12, 0x86, 0x2c, 0xb8, 0x19, 0x8d, 0x46, 0xd2, 0x73, 0xe7, 0xf8, 0x6c, 0xcd, 0x59, 0x92, 0x6, 0xa7, 0x33, 0xc1, 0x55, 0xf4, 0x60, 0xab, 0x3f, 0x9e, 0xa, 0x15, 0x81, 0x20, 0xb4, 0x7f, 0xeb, 0x4a, 0xde, 0x74, 0xe0, 0x41, 0xd5, 0x1e, 0x8a, 0x2b, 0xbf, 0xa0, 0x34, 0x95, 0x1, 0xca, 0x5e, 0xff, 0x6b, 0xb6, 0x22, 0x83, 0x17, 0xdc, 0x48, 0xe9, 0x7d, 0x62, 0xf6, 0x57, 0xc3, 0x8, 0x9c, 0x3d, 0xa9, 0x3, 0x97, 0x36, 0xa2, 0x69, 0xfd, 0x5c, 0xc8, 0xd7, 0x43, 0xe2, 0x76, 0xbd, 0x29, 0x88, 0x1c, 0x2f, 0xbb, 0x1a, 0x8e, 0x45, 0xd1, 0x70, 0xe4, 0xfb, 0x6f, 0xce, 0x5a, 0x91, 0x5, 0xa4, 0x30, 0x9a, 0xe, 0xaf, 0x3b, 0xf0, 0x64, 0xc5, 0x51, 0x4e, 0xda, 0x7b, 0xef, 0x24, 0xb0, 0x11, 0x85, 0x58, 0xcc, 0x6d, 0xf9, 0x32, 0xa6, 0x7, 0x93, 0x8c, 0x18, 0xb9, 0x2d, 0xe6, 0x72, 0xd3, 0x47, 0xed, 0x79, 0xd8, 0x4c, 0x87, 0x13, 0xb2, 0x26, 0x39, 0xad, 0xc, 0x98, 0x53, 0xc7, 0x66, 0xf2}, + {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10, 0xa5, 0x30, 0x92, 0x7, 0xcb, 0x5e, 0xfc, 0x69, 0x79, 0xec, 0x4e, 0xdb, 0x17, 0x82, 0x20, 0xb5, 0x57, 0xc2, 0x60, 0xf5, 0x39, 0xac, 0xe, 0x9b, 0x8b, 0x1e, 0xbc, 0x29, 0xe5, 0x70, 0xd2, 0x47, 0xf2, 0x67, 0xc5, 0x50, 0x9c, 0x9, 0xab, 0x3e, 0x2e, 0xbb, 0x19, 0x8c, 0x40, 0xd5, 0x77, 0xe2, 0xae, 0x3b, 0x99, 0xc, 0xc0, 0x55, 0xf7, 0x62, 0x72, 0xe7, 0x45, 0xd0, 0x1c, 0x89, 0x2b, 0xbe, 0xb, 0x9e, 0x3c, 0xa9, 0x65, 0xf0, 0x52, 0xc7, 0xd7, 0x42, 0xe0, 0x75, 0xb9, 0x2c, 0x8e, 0x1b, 0xf9, 0x6c, 0xce, 0x5b, 0x97, 0x2, 0xa0, 0x35, 0x25, 0xb0, 0x12, 0x87, 0x4b, 0xde, 0x7c, 0xe9, 0x5c, 0xc9, 0x6b, 0xfe, 0x32, 0xa7, 0x5, 0x90, 0x80, 0x15, 0xb7, 0x22, 0xee, 0x7b, 0xd9, 0x4c, 0x41, 0xd4, 0x76, 0xe3, 0x2f, 0xba, 0x18, 0x8d, 0x9d, 
0x8, 0xaa, 0x3f, 0xf3, 0x66, 0xc4, 0x51, 0xe4, 0x71, 0xd3, 0x46, 0x8a, 0x1f, 0xbd, 0x28, 0x38, 0xad, 0xf, 0x9a, 0x56, 0xc3, 0x61, 0xf4, 0x16, 0x83, 0x21, 0xb4, 0x78, 0xed, 0x4f, 0xda, 0xca, 0x5f, 0xfd, 0x68, 0xa4, 0x31, 0x93, 0x6, 0xb3, 0x26, 0x84, 0x11, 0xdd, 0x48, 0xea, 0x7f, 0x6f, 0xfa, 0x58, 0xcd, 0x1, 0x94, 0x36, 0xa3, 0xef, 0x7a, 0xd8, 0x4d, 0x81, 0x14, 0xb6, 0x23, 0x33, 0xa6, 0x4, 0x91, 0x5d, 0xc8, 0x6a, 0xff, 0x4a, 0xdf, 0x7d, 0xe8, 0x24, 0xb1, 0x13, 0x86, 0x96, 0x3, 0xa1, 0x34, 0xf8, 0x6d, 0xcf, 0x5a, 0xb8, 0x2d, 0x8f, 0x1a, 0xd6, 0x43, 0xe1, 0x74, 0x64, 0xf1, 0x53, 0xc6, 0xa, 0x9f, 0x3d, 0xa8, 0x1d, 0x88, 0x2a, 0xbf, 0x73, 0xe6, 0x44, 0xd1, 0xc1, 0x54, 0xf6, 0x63, 0xaf, 0x3a, 0x98, 0xd}, + {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1, 0x95, 0x3, 0xa4, 0x32, 0xf7, 0x61, 0xc6, 0x50, 0x51, 0xc7, 0x60, 0xf6, 0x33, 0xa5, 0x2, 0x94, 0x37, 0xa1, 0x6, 0x90, 0x55, 0xc3, 0x64, 0xf2, 0xf3, 0x65, 0xc2, 0x54, 0x91, 0x7, 0xa0, 0x36, 0xa2, 0x34, 0x93, 0x5, 0xc0, 0x56, 0xf1, 0x67, 0x66, 0xf0, 0x57, 0xc1, 0x4, 0x92, 0x35, 0xa3, 0x6e, 0xf8, 0x5f, 0xc9, 0xc, 0x9a, 0x3d, 0xab, 0xaa, 0x3c, 0x9b, 0xd, 0xc8, 0x5e, 0xf9, 0x6f, 0xfb, 0x6d, 0xca, 0x5c, 0x99, 0xf, 0xa8, 0x3e, 0x3f, 0xa9, 0xe, 0x98, 0x5d, 0xcb, 0x6c, 0xfa, 0x59, 0xcf, 0x68, 0xfe, 0x3b, 0xad, 0xa, 0x9c, 0x9d, 0xb, 0xac, 0x3a, 0xff, 0x69, 0xce, 0x58, 0xcc, 0x5a, 0xfd, 0x6b, 0xae, 0x38, 0x9f, 0x9, 0x8, 0x9e, 0x39, 0xaf, 0x6a, 0xfc, 0x5b, 0xcd, 0xdc, 0x4a, 0xed, 0x7b, 0xbe, 0x28, 0x8f, 0x19, 0x18, 0x8e, 0x29, 0xbf, 0x7a, 0xec, 0x4b, 0xdd, 0x49, 0xdf, 0x78, 0xee, 0x2b, 0xbd, 0x1a, 0x8c, 0x8d, 0x1b, 0xbc, 0x2a, 0xef, 0x79, 0xde, 0x48, 0xeb, 0x7d, 0xda, 0x4c, 0x89, 0x1f, 0xb8, 0x2e, 0x2f, 0xb9, 0x1e, 0x88, 0x4d, 0xdb, 0x7c, 0xea, 0x7e, 0xe8, 0x4f, 0xd9, 0x1c, 0x8a, 0x2d, 0xbb, 0xba, 0x2c, 0x8b, 0x1d, 0xd8, 0x4e, 0xe9, 0x7f, 0xb2, 0x24, 0x83, 0x15, 0xd0, 0x46, 0xe1, 0x77, 0x76, 0xe0, 0x47, 0xd1, 0x14, 0x82, 0x25, 0xb3, 0x27, 0xb1, 0x16, 0x80, 0x45, 0xd3, 0x74, 0xe2, 0xe3, 0x75, 0xd2, 0x44, 0x81, 0x17, 0xb0, 0x26, 0x85, 0x13, 0xb4, 0x22, 0xe7, 0x71, 0xd6, 0x40, 0x41, 0xd7, 0x70, 0xe6, 0x23, 0xb5, 0x12, 0x84, 0x10, 0x86, 0x21, 0xb7, 0x72, 0xe4, 0x43, 0xd5, 0xd4, 0x42, 0xe5, 0x73, 0xb6, 0x20, 0x87, 0x11}, + {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe, 0x85, 0x12, 0xb6, 0x21, 0xe3, 0x74, 0xd0, 0x47, 0x49, 0xde, 0x7a, 0xed, 0x2f, 0xb8, 0x1c, 0x8b, 0x17, 0x80, 0x24, 0xb3, 0x71, 0xe6, 0x42, 0xd5, 0xdb, 0x4c, 0xe8, 0x7f, 0xbd, 0x2a, 0x8e, 0x19, 0x92, 0x5, 0xa1, 0x36, 0xf4, 0x63, 0xc7, 0x50, 0x5e, 0xc9, 0x6d, 0xfa, 0x38, 0xaf, 0xb, 0x9c, 0x2e, 0xb9, 0x1d, 0x8a, 0x48, 0xdf, 0x7b, 0xec, 0xe2, 0x75, 0xd1, 0x46, 0x84, 0x13, 0xb7, 0x20, 0xab, 0x3c, 0x98, 0xf, 0xcd, 0x5a, 0xfe, 0x69, 0x67, 0xf0, 0x54, 0xc3, 0x1, 0x96, 0x32, 0xa5, 0x39, 0xae, 0xa, 0x9d, 0x5f, 0xc8, 0x6c, 0xfb, 0xf5, 0x62, 0xc6, 0x51, 0x93, 0x4, 0xa0, 0x37, 0xbc, 0x2b, 0x8f, 0x18, 0xda, 0x4d, 0xe9, 0x7e, 0x70, 0xe7, 0x43, 0xd4, 0x16, 0x81, 0x25, 0xb2, 0x5c, 0xcb, 0x6f, 0xf8, 0x3a, 0xad, 0x9, 0x9e, 0x90, 0x7, 0xa3, 0x34, 0xf6, 0x61, 0xc5, 0x52, 0xd9, 0x4e, 0xea, 0x7d, 0xbf, 0x28, 0x8c, 0x1b, 0x15, 0x82, 0x26, 0xb1, 0x73, 0xe4, 0x40, 0xd7, 0x4b, 0xdc, 0x78, 0xef, 0x2d, 0xba, 0x1e, 0x89, 0x87, 0x10, 0xb4, 0x23, 0xe1, 0x76, 0xd2, 0x45, 0xce, 0x59, 0xfd, 0x6a, 0xa8, 0x3f, 0x9b, 0xc, 0x2, 0x95, 0x31, 0xa6, 0x64, 0xf3, 0x57, 0xc0, 0x72, 0xe5, 0x41, 0xd6, 0x14, 0x83, 0x27, 0xb0, 0xbe, 0x29, 0x8d, 0x1a, 0xd8, 0x4f, 0xeb, 0x7c, 0xf7, 0x60, 0xc4, 0x53, 0x91, 0x6, 0xa2, 0x35, 0x3b, 0xac, 0x8, 0x9f, 0x5d, 0xca, 
0x6e, 0xf9, 0x65, 0xf2, 0x56, 0xc1, 0x3, 0x94, 0x30, 0xa7, 0xa9, 0x3e, 0x9a, 0xd, 0xcf, 0x58, 0xfc, 0x6b, 0xe0, 0x77, 0xd3, 0x44, 0x86, 0x11, 0xb5, 0x22, 0x2c, 0xbb, 0x1f, 0x88, 0x4a, 0xdd, 0x79, 0xee}, + {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b, 0x75, 0xed, 0x58, 0xc0, 0x2f, 0xb7, 0x2, 0x9a, 0xc1, 0x59, 0xec, 0x74, 0x9b, 0x3, 0xb6, 0x2e, 0xea, 0x72, 0xc7, 0x5f, 0xb0, 0x28, 0x9d, 0x5, 0x5e, 0xc6, 0x73, 0xeb, 0x4, 0x9c, 0x29, 0xb1, 0x9f, 0x7, 0xb2, 0x2a, 0xc5, 0x5d, 0xe8, 0x70, 0x2b, 0xb3, 0x6, 0x9e, 0x71, 0xe9, 0x5c, 0xc4, 0xc9, 0x51, 0xe4, 0x7c, 0x93, 0xb, 0xbe, 0x26, 0x7d, 0xe5, 0x50, 0xc8, 0x27, 0xbf, 0xa, 0x92, 0xbc, 0x24, 0x91, 0x9, 0xe6, 0x7e, 0xcb, 0x53, 0x8, 0x90, 0x25, 0xbd, 0x52, 0xca, 0x7f, 0xe7, 0x23, 0xbb, 0xe, 0x96, 0x79, 0xe1, 0x54, 0xcc, 0x97, 0xf, 0xba, 0x22, 0xcd, 0x55, 0xe0, 0x78, 0x56, 0xce, 0x7b, 0xe3, 0xc, 0x94, 0x21, 0xb9, 0xe2, 0x7a, 0xcf, 0x57, 0xb8, 0x20, 0x95, 0xd, 0x8f, 0x17, 0xa2, 0x3a, 0xd5, 0x4d, 0xf8, 0x60, 0x3b, 0xa3, 0x16, 0x8e, 0x61, 0xf9, 0x4c, 0xd4, 0xfa, 0x62, 0xd7, 0x4f, 0xa0, 0x38, 0x8d, 0x15, 0x4e, 0xd6, 0x63, 0xfb, 0x14, 0x8c, 0x39, 0xa1, 0x65, 0xfd, 0x48, 0xd0, 0x3f, 0xa7, 0x12, 0x8a, 0xd1, 0x49, 0xfc, 0x64, 0x8b, 0x13, 0xa6, 0x3e, 0x10, 0x88, 0x3d, 0xa5, 0x4a, 0xd2, 0x67, 0xff, 0xa4, 0x3c, 0x89, 0x11, 0xfe, 0x66, 0xd3, 0x4b, 0x46, 0xde, 0x6b, 0xf3, 0x1c, 0x84, 0x31, 0xa9, 0xf2, 0x6a, 0xdf, 0x47, 0xa8, 0x30, 0x85, 0x1d, 0x33, 0xab, 0x1e, 0x86, 0x69, 0xf1, 0x44, 0xdc, 0x87, 0x1f, 0xaa, 0x32, 0xdd, 0x45, 0xf0, 0x68, 0xac, 0x34, 0x81, 0x19, 0xf6, 0x6e, 0xdb, 0x43, 0x18, 0x80, 0x35, 0xad, 0x42, 0xda, 0x6f, 0xf7, 0xd9, 0x41, 0xf4, 0x6c, 0x83, 0x1b, 0xae, 0x36, 0x6d, 0xf5, 0x40, 0xd8, 0x37, 0xaf, 0x1a, 0x82}, + {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54, 0x65, 0xfc, 0x4a, 0xd3, 0x3b, 0xa2, 0x14, 0x8d, 0xd9, 0x40, 0xf6, 0x6f, 0x87, 0x1e, 0xa8, 0x31, 0xca, 0x53, 0xe5, 0x7c, 0x94, 0xd, 0xbb, 0x22, 0x76, 0xef, 0x59, 0xc0, 0x28, 0xb1, 0x7, 0x9e, 0xaf, 0x36, 0x80, 0x19, 0xf1, 0x68, 0xde, 0x47, 0x13, 0x8a, 0x3c, 0xa5, 0x4d, 0xd4, 0x62, 0xfb, 0x89, 0x10, 0xa6, 0x3f, 0xd7, 0x4e, 0xf8, 0x61, 0x35, 0xac, 0x1a, 0x83, 0x6b, 0xf2, 0x44, 0xdd, 0xec, 0x75, 0xc3, 0x5a, 0xb2, 0x2b, 0x9d, 0x4, 0x50, 0xc9, 0x7f, 0xe6, 0xe, 0x97, 0x21, 0xb8, 0x43, 0xda, 0x6c, 0xf5, 0x1d, 0x84, 0x32, 0xab, 0xff, 0x66, 0xd0, 0x49, 0xa1, 0x38, 0x8e, 0x17, 0x26, 0xbf, 0x9, 0x90, 0x78, 0xe1, 0x57, 0xce, 0x9a, 0x3, 0xb5, 0x2c, 0xc4, 0x5d, 0xeb, 0x72, 0xf, 0x96, 0x20, 0xb9, 0x51, 0xc8, 0x7e, 0xe7, 0xb3, 0x2a, 0x9c, 0x5, 0xed, 0x74, 0xc2, 0x5b, 0x6a, 0xf3, 0x45, 0xdc, 0x34, 0xad, 0x1b, 0x82, 0xd6, 0x4f, 0xf9, 0x60, 0x88, 0x11, 0xa7, 0x3e, 0xc5, 0x5c, 0xea, 0x73, 0x9b, 0x2, 0xb4, 0x2d, 0x79, 0xe0, 0x56, 0xcf, 0x27, 0xbe, 0x8, 0x91, 0xa0, 0x39, 0x8f, 0x16, 0xfe, 0x67, 0xd1, 0x48, 0x1c, 0x85, 0x33, 0xaa, 0x42, 0xdb, 0x6d, 0xf4, 0x86, 0x1f, 0xa9, 0x30, 0xd8, 0x41, 0xf7, 0x6e, 0x3a, 0xa3, 0x15, 0x8c, 0x64, 0xfd, 0x4b, 0xd2, 0xe3, 0x7a, 0xcc, 0x55, 0xbd, 0x24, 0x92, 0xb, 0x5f, 0xc6, 0x70, 0xe9, 0x1, 0x98, 0x2e, 0xb7, 0x4c, 0xd5, 0x63, 0xfa, 0x12, 0x8b, 0x3d, 0xa4, 0xf0, 0x69, 0xdf, 0x46, 0xae, 0x37, 0x81, 0x18, 0x29, 0xb0, 0x6, 0x9f, 0x77, 0xee, 0x58, 0xc1, 0x95, 0xc, 0xba, 0x23, 0xcb, 0x52, 0xe4, 0x7d}, + {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45, 0x55, 0xcf, 0x7c, 0xe6, 0x7, 0x9d, 0x2e, 0xb4, 0xf1, 0x6b, 0xd8, 0x42, 0xa3, 0x39, 0x8a, 0x10, 0xaa, 0x30, 0x83, 0x19, 0xf8, 0x62, 0xd1, 0x4b, 0xe, 0x94, 0x27, 0xbd, 0x5c, 0xc6, 0x75, 0xef, 0xff, 0x65, 
0xd6, 0x4c, 0xad, 0x37, 0x84, 0x1e, 0x5b, 0xc1, 0x72, 0xe8, 0x9, 0x93, 0x20, 0xba, 0x49, 0xd3, 0x60, 0xfa, 0x1b, 0x81, 0x32, 0xa8, 0xed, 0x77, 0xc4, 0x5e, 0xbf, 0x25, 0x96, 0xc, 0x1c, 0x86, 0x35, 0xaf, 0x4e, 0xd4, 0x67, 0xfd, 0xb8, 0x22, 0x91, 0xb, 0xea, 0x70, 0xc3, 0x59, 0xe3, 0x79, 0xca, 0x50, 0xb1, 0x2b, 0x98, 0x2, 0x47, 0xdd, 0x6e, 0xf4, 0x15, 0x8f, 0x3c, 0xa6, 0xb6, 0x2c, 0x9f, 0x5, 0xe4, 0x7e, 0xcd, 0x57, 0x12, 0x88, 0x3b, 0xa1, 0x40, 0xda, 0x69, 0xf3, 0x92, 0x8, 0xbb, 0x21, 0xc0, 0x5a, 0xe9, 0x73, 0x36, 0xac, 0x1f, 0x85, 0x64, 0xfe, 0x4d, 0xd7, 0xc7, 0x5d, 0xee, 0x74, 0x95, 0xf, 0xbc, 0x26, 0x63, 0xf9, 0x4a, 0xd0, 0x31, 0xab, 0x18, 0x82, 0x38, 0xa2, 0x11, 0x8b, 0x6a, 0xf0, 0x43, 0xd9, 0x9c, 0x6, 0xb5, 0x2f, 0xce, 0x54, 0xe7, 0x7d, 0x6d, 0xf7, 0x44, 0xde, 0x3f, 0xa5, 0x16, 0x8c, 0xc9, 0x53, 0xe0, 0x7a, 0x9b, 0x1, 0xb2, 0x28, 0xdb, 0x41, 0xf2, 0x68, 0x89, 0x13, 0xa0, 0x3a, 0x7f, 0xe5, 0x56, 0xcc, 0x2d, 0xb7, 0x4, 0x9e, 0x8e, 0x14, 0xa7, 0x3d, 0xdc, 0x46, 0xf5, 0x6f, 0x2a, 0xb0, 0x3, 0x99, 0x78, 0xe2, 0x51, 0xcb, 0x71, 0xeb, 0x58, 0xc2, 0x23, 0xb9, 0xa, 0x90, 0xd5, 0x4f, 0xfc, 0x66, 0x87, 0x1d, 0xae, 0x34, 0x24, 0xbe, 0xd, 0x97, 0x76, 0xec, 0x5f, 0xc5, 0x80, 0x1a, 0xa9, 0x33, 0xd2, 0x48, 0xfb, 0x61}, + {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a, 0x45, 0xde, 0x6e, 0xf5, 0x13, 0x88, 0x38, 0xa3, 0xe9, 0x72, 0xc2, 0x59, 0xbf, 0x24, 0x94, 0xf, 0x8a, 0x11, 0xa1, 0x3a, 0xdc, 0x47, 0xf7, 0x6c, 0x26, 0xbd, 0xd, 0x96, 0x70, 0xeb, 0x5b, 0xc0, 0xcf, 0x54, 0xe4, 0x7f, 0x99, 0x2, 0xb2, 0x29, 0x63, 0xf8, 0x48, 0xd3, 0x35, 0xae, 0x1e, 0x85, 0x9, 0x92, 0x22, 0xb9, 0x5f, 0xc4, 0x74, 0xef, 0xa5, 0x3e, 0x8e, 0x15, 0xf3, 0x68, 0xd8, 0x43, 0x4c, 0xd7, 0x67, 0xfc, 0x1a, 0x81, 0x31, 0xaa, 0xe0, 0x7b, 0xcb, 0x50, 0xb6, 0x2d, 0x9d, 0x6, 0x83, 0x18, 0xa8, 0x33, 0xd5, 0x4e, 0xfe, 0x65, 0x2f, 0xb4, 0x4, 0x9f, 0x79, 0xe2, 0x52, 0xc9, 0xc6, 0x5d, 0xed, 0x76, 0x90, 0xb, 0xbb, 0x20, 0x6a, 0xf1, 0x41, 0xda, 0x3c, 0xa7, 0x17, 0x8c, 0x12, 0x89, 0x39, 0xa2, 0x44, 0xdf, 0x6f, 0xf4, 0xbe, 0x25, 0x95, 0xe, 0xe8, 0x73, 0xc3, 0x58, 0x57, 0xcc, 0x7c, 0xe7, 0x1, 0x9a, 0x2a, 0xb1, 0xfb, 0x60, 0xd0, 0x4b, 0xad, 0x36, 0x86, 0x1d, 0x98, 0x3, 0xb3, 0x28, 0xce, 0x55, 0xe5, 0x7e, 0x34, 0xaf, 0x1f, 0x84, 0x62, 0xf9, 0x49, 0xd2, 0xdd, 0x46, 0xf6, 0x6d, 0x8b, 0x10, 0xa0, 0x3b, 0x71, 0xea, 0x5a, 0xc1, 0x27, 0xbc, 0xc, 0x97, 0x1b, 0x80, 0x30, 0xab, 0x4d, 0xd6, 0x66, 0xfd, 0xb7, 0x2c, 0x9c, 0x7, 0xe1, 0x7a, 0xca, 0x51, 0x5e, 0xc5, 0x75, 0xee, 0x8, 0x93, 0x23, 0xb8, 0xf2, 0x69, 0xd9, 0x42, 0xa4, 0x3f, 0x8f, 0x14, 0x91, 0xa, 0xba, 0x21, 0xc7, 0x5c, 0xec, 0x77, 0x3d, 0xa6, 0x16, 0x8d, 0x6b, 0xf0, 0x40, 0xdb, 0xd4, 0x4f, 0xff, 0x64, 0x82, 0x19, 0xa9, 0x32, 0x78, 0xe3, 0x53, 0xc8, 0x2e, 0xb5, 0x5, 0x9e}, + {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67, 0x35, 0xa9, 0x10, 0x8c, 0x7f, 0xe3, 0x5a, 0xc6, 0xa1, 0x3d, 0x84, 0x18, 0xeb, 0x77, 0xce, 0x52, 0x6a, 0xf6, 0x4f, 0xd3, 0x20, 0xbc, 0x5, 0x99, 0xfe, 0x62, 0xdb, 0x47, 0xb4, 0x28, 0x91, 0xd, 0x5f, 0xc3, 0x7a, 0xe6, 0x15, 0x89, 0x30, 0xac, 0xcb, 0x57, 0xee, 0x72, 0x81, 0x1d, 0xa4, 0x38, 0xd4, 0x48, 0xf1, 0x6d, 0x9e, 0x2, 0xbb, 0x27, 0x40, 0xdc, 0x65, 0xf9, 0xa, 0x96, 0x2f, 0xb3, 0xe1, 0x7d, 0xc4, 0x58, 0xab, 0x37, 0x8e, 0x12, 0x75, 0xe9, 0x50, 0xcc, 0x3f, 0xa3, 0x1a, 0x86, 0xbe, 0x22, 0x9b, 0x7, 0xf4, 0x68, 0xd1, 0x4d, 0x2a, 0xb6, 0xf, 0x93, 0x60, 0xfc, 0x45, 0xd9, 0x8b, 0x17, 0xae, 0x32, 0xc1, 0x5d, 0xe4, 0x78, 0x1f, 0x83, 0x3a, 0xa6, 0x55, 0xc9, 0x70, 0xec, 0xb5, 0x29, 0x90, 0xc, 0xff, 0x63, 0xda, 
0x46, 0x21, 0xbd, 0x4, 0x98, 0x6b, 0xf7, 0x4e, 0xd2, 0x80, 0x1c, 0xa5, 0x39, 0xca, 0x56, 0xef, 0x73, 0x14, 0x88, 0x31, 0xad, 0x5e, 0xc2, 0x7b, 0xe7, 0xdf, 0x43, 0xfa, 0x66, 0x95, 0x9, 0xb0, 0x2c, 0x4b, 0xd7, 0x6e, 0xf2, 0x1, 0x9d, 0x24, 0xb8, 0xea, 0x76, 0xcf, 0x53, 0xa0, 0x3c, 0x85, 0x19, 0x7e, 0xe2, 0x5b, 0xc7, 0x34, 0xa8, 0x11, 0x8d, 0x61, 0xfd, 0x44, 0xd8, 0x2b, 0xb7, 0xe, 0x92, 0xf5, 0x69, 0xd0, 0x4c, 0xbf, 0x23, 0x9a, 0x6, 0x54, 0xc8, 0x71, 0xed, 0x1e, 0x82, 0x3b, 0xa7, 0xc0, 0x5c, 0xe5, 0x79, 0x8a, 0x16, 0xaf, 0x33, 0xb, 0x97, 0x2e, 0xb2, 0x41, 0xdd, 0x64, 0xf8, 0x9f, 0x3, 0xba, 0x26, 0xd5, 0x49, 0xf0, 0x6c, 0x3e, 0xa2, 0x1b, 0x87, 0x74, 0xe8, 0x51, 0xcd, 0xaa, 0x36, 0x8f, 0x13, 0xe0, 0x7c, 0xc5, 0x59}, + {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68, 0x25, 0xb8, 0x2, 0x9f, 0x6b, 0xf6, 0x4c, 0xd1, 0xb9, 0x24, 0x9e, 0x3, 0xf7, 0x6a, 0xd0, 0x4d, 0x4a, 0xd7, 0x6d, 0xf0, 0x4, 0x99, 0x23, 0xbe, 0xd6, 0x4b, 0xf1, 0x6c, 0x98, 0x5, 0xbf, 0x22, 0x6f, 0xf2, 0x48, 0xd5, 0x21, 0xbc, 0x6, 0x9b, 0xf3, 0x6e, 0xd4, 0x49, 0xbd, 0x20, 0x9a, 0x7, 0x94, 0x9, 0xb3, 0x2e, 0xda, 0x47, 0xfd, 0x60, 0x8, 0x95, 0x2f, 0xb2, 0x46, 0xdb, 0x61, 0xfc, 0xb1, 0x2c, 0x96, 0xb, 0xff, 0x62, 0xd8, 0x45, 0x2d, 0xb0, 0xa, 0x97, 0x63, 0xfe, 0x44, 0xd9, 0xde, 0x43, 0xf9, 0x64, 0x90, 0xd, 0xb7, 0x2a, 0x42, 0xdf, 0x65, 0xf8, 0xc, 0x91, 0x2b, 0xb6, 0xfb, 0x66, 0xdc, 0x41, 0xb5, 0x28, 0x92, 0xf, 0x67, 0xfa, 0x40, 0xdd, 0x29, 0xb4, 0xe, 0x93, 0x35, 0xa8, 0x12, 0x8f, 0x7b, 0xe6, 0x5c, 0xc1, 0xa9, 0x34, 0x8e, 0x13, 0xe7, 0x7a, 0xc0, 0x5d, 0x10, 0x8d, 0x37, 0xaa, 0x5e, 0xc3, 0x79, 0xe4, 0x8c, 0x11, 0xab, 0x36, 0xc2, 0x5f, 0xe5, 0x78, 0x7f, 0xe2, 0x58, 0xc5, 0x31, 0xac, 0x16, 0x8b, 0xe3, 0x7e, 0xc4, 0x59, 0xad, 0x30, 0x8a, 0x17, 0x5a, 0xc7, 0x7d, 0xe0, 0x14, 0x89, 0x33, 0xae, 0xc6, 0x5b, 0xe1, 0x7c, 0x88, 0x15, 0xaf, 0x32, 0xa1, 0x3c, 0x86, 0x1b, 0xef, 0x72, 0xc8, 0x55, 0x3d, 0xa0, 0x1a, 0x87, 0x73, 0xee, 0x54, 0xc9, 0x84, 0x19, 0xa3, 0x3e, 0xca, 0x57, 0xed, 0x70, 0x18, 0x85, 0x3f, 0xa2, 0x56, 0xcb, 0x71, 0xec, 0xeb, 0x76, 0xcc, 0x51, 0xa5, 0x38, 0x82, 0x1f, 0x77, 0xea, 0x50, 0xcd, 0x39, 0xa4, 0x1e, 0x83, 0xce, 0x53, 0xe9, 0x74, 0x80, 0x1d, 0xa7, 0x3a, 0x52, 0xcf, 0x75, 0xe8, 0x1c, 0x81, 0x3b, 0xa6}, + {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79, 0x15, 0x8b, 0x34, 0xaa, 0x57, 0xc9, 0x76, 0xe8, 0x91, 0xf, 0xb0, 0x2e, 0xd3, 0x4d, 0xf2, 0x6c, 0x2a, 0xb4, 0xb, 0x95, 0x68, 0xf6, 0x49, 0xd7, 0xae, 0x30, 0x8f, 0x11, 0xec, 0x72, 0xcd, 0x53, 0x3f, 0xa1, 0x1e, 0x80, 0x7d, 0xe3, 0x5c, 0xc2, 0xbb, 0x25, 0x9a, 0x4, 0xf9, 0x67, 0xd8, 0x46, 0x54, 0xca, 0x75, 0xeb, 0x16, 0x88, 0x37, 0xa9, 0xd0, 0x4e, 0xf1, 0x6f, 0x92, 0xc, 0xb3, 0x2d, 0x41, 0xdf, 0x60, 0xfe, 0x3, 0x9d, 0x22, 0xbc, 0xc5, 0x5b, 0xe4, 0x7a, 0x87, 0x19, 0xa6, 0x38, 0x7e, 0xe0, 0x5f, 0xc1, 0x3c, 0xa2, 0x1d, 0x83, 0xfa, 0x64, 0xdb, 0x45, 0xb8, 0x26, 0x99, 0x7, 0x6b, 0xf5, 0x4a, 0xd4, 0x29, 0xb7, 0x8, 0x96, 0xef, 0x71, 0xce, 0x50, 0xad, 0x33, 0x8c, 0x12, 0xa8, 0x36, 0x89, 0x17, 0xea, 0x74, 0xcb, 0x55, 0x2c, 0xb2, 0xd, 0x93, 0x6e, 0xf0, 0x4f, 0xd1, 0xbd, 0x23, 0x9c, 0x2, 0xff, 0x61, 0xde, 0x40, 0x39, 0xa7, 0x18, 0x86, 0x7b, 0xe5, 0x5a, 0xc4, 0x82, 0x1c, 0xa3, 0x3d, 0xc0, 0x5e, 0xe1, 0x7f, 0x6, 0x98, 0x27, 0xb9, 0x44, 0xda, 0x65, 0xfb, 0x97, 0x9, 0xb6, 0x28, 0xd5, 0x4b, 0xf4, 0x6a, 0x13, 0x8d, 0x32, 0xac, 0x51, 0xcf, 0x70, 0xee, 0xfc, 0x62, 0xdd, 0x43, 0xbe, 0x20, 0x9f, 0x1, 0x78, 0xe6, 0x59, 0xc7, 0x3a, 0xa4, 0x1b, 0x85, 0xe9, 0x77, 0xc8, 0x56, 0xab, 0x35, 0x8a, 0x14, 0x6d, 0xf3, 0x4c, 0xd2, 
0x2f, 0xb1, 0xe, 0x90, 0xd6, 0x48, 0xf7, 0x69, 0x94, 0xa, 0xb5, 0x2b, 0x52, 0xcc, 0x73, 0xed, 0x10, 0x8e, 0x31, 0xaf, 0xc3, 0x5d, 0xe2, 0x7c, 0x81, 0x1f, 0xa0, 0x3e, 0x47, 0xd9, 0x66, 0xf8, 0x5, 0x9b, 0x24, 0xba}, + {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76, 0x5, 0x9a, 0x26, 0xb9, 0x43, 0xdc, 0x60, 0xff, 0x89, 0x16, 0xaa, 0x35, 0xcf, 0x50, 0xec, 0x73, 0xa, 0x95, 0x29, 0xb6, 0x4c, 0xd3, 0x6f, 0xf0, 0x86, 0x19, 0xa5, 0x3a, 0xc0, 0x5f, 0xe3, 0x7c, 0xf, 0x90, 0x2c, 0xb3, 0x49, 0xd6, 0x6a, 0xf5, 0x83, 0x1c, 0xa0, 0x3f, 0xc5, 0x5a, 0xe6, 0x79, 0x14, 0x8b, 0x37, 0xa8, 0x52, 0xcd, 0x71, 0xee, 0x98, 0x7, 0xbb, 0x24, 0xde, 0x41, 0xfd, 0x62, 0x11, 0x8e, 0x32, 0xad, 0x57, 0xc8, 0x74, 0xeb, 0x9d, 0x2, 0xbe, 0x21, 0xdb, 0x44, 0xf8, 0x67, 0x1e, 0x81, 0x3d, 0xa2, 0x58, 0xc7, 0x7b, 0xe4, 0x92, 0xd, 0xb1, 0x2e, 0xd4, 0x4b, 0xf7, 0x68, 0x1b, 0x84, 0x38, 0xa7, 0x5d, 0xc2, 0x7e, 0xe1, 0x97, 0x8, 0xb4, 0x2b, 0xd1, 0x4e, 0xf2, 0x6d, 0x28, 0xb7, 0xb, 0x94, 0x6e, 0xf1, 0x4d, 0xd2, 0xa4, 0x3b, 0x87, 0x18, 0xe2, 0x7d, 0xc1, 0x5e, 0x2d, 0xb2, 0xe, 0x91, 0x6b, 0xf4, 0x48, 0xd7, 0xa1, 0x3e, 0x82, 0x1d, 0xe7, 0x78, 0xc4, 0x5b, 0x22, 0xbd, 0x1, 0x9e, 0x64, 0xfb, 0x47, 0xd8, 0xae, 0x31, 0x8d, 0x12, 0xe8, 0x77, 0xcb, 0x54, 0x27, 0xb8, 0x4, 0x9b, 0x61, 0xfe, 0x42, 0xdd, 0xab, 0x34, 0x88, 0x17, 0xed, 0x72, 0xce, 0x51, 0x3c, 0xa3, 0x1f, 0x80, 0x7a, 0xe5, 0x59, 0xc6, 0xb0, 0x2f, 0x93, 0xc, 0xf6, 0x69, 0xd5, 0x4a, 0x39, 0xa6, 0x1a, 0x85, 0x7f, 0xe0, 0x5c, 0xc3, 0xb5, 0x2a, 0x96, 0x9, 0xf3, 0x6c, 0xd0, 0x4f, 0x36, 0xa9, 0x15, 0x8a, 0x70, 0xef, 0x53, 0xcc, 0xba, 0x25, 0x99, 0x6, 0xfc, 0x63, 0xdf, 0x40, 0x33, 0xac, 0x10, 0x8f, 0x75, 0xea, 0x56, 0xc9, 0xbf, 0x20, 0x9c, 0x3, 0xf9, 0x66, 0xda, 0x45}, + {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e, 0xd2, 0x72, 0x8f, 0x2f, 0x68, 0xc8, 0x35, 0x95, 0xbb, 0x1b, 0xe6, 0x46, 0x1, 0xa1, 0x5c, 0xfc, 0xb9, 0x19, 0xe4, 0x44, 0x3, 0xa3, 0x5e, 0xfe, 0xd0, 0x70, 0x8d, 0x2d, 0x6a, 0xca, 0x37, 0x97, 0x6b, 0xcb, 0x36, 0x96, 0xd1, 0x71, 0x8c, 0x2c, 0x2, 0xa2, 0x5f, 0xff, 0xb8, 0x18, 0xe5, 0x45, 0x6f, 0xcf, 0x32, 0x92, 0xd5, 0x75, 0x88, 0x28, 0x6, 0xa6, 0x5b, 0xfb, 0xbc, 0x1c, 0xe1, 0x41, 0xbd, 0x1d, 0xe0, 0x40, 0x7, 0xa7, 0x5a, 0xfa, 0xd4, 0x74, 0x89, 0x29, 0x6e, 0xce, 0x33, 0x93, 0xd6, 0x76, 0x8b, 0x2b, 0x6c, 0xcc, 0x31, 0x91, 0xbf, 0x1f, 0xe2, 0x42, 0x5, 0xa5, 0x58, 0xf8, 0x4, 0xa4, 0x59, 0xf9, 0xbe, 0x1e, 0xe3, 0x43, 0x6d, 0xcd, 0x30, 0x90, 0xd7, 0x77, 0x8a, 0x2a, 0xde, 0x7e, 0x83, 0x23, 0x64, 0xc4, 0x39, 0x99, 0xb7, 0x17, 0xea, 0x4a, 0xd, 0xad, 0x50, 0xf0, 0xc, 0xac, 0x51, 0xf1, 0xb6, 0x16, 0xeb, 0x4b, 0x65, 0xc5, 0x38, 0x98, 0xdf, 0x7f, 0x82, 0x22, 0x67, 0xc7, 0x3a, 0x9a, 0xdd, 0x7d, 0x80, 0x20, 0xe, 0xae, 0x53, 0xf3, 0xb4, 0x14, 0xe9, 0x49, 0xb5, 0x15, 0xe8, 0x48, 0xf, 0xaf, 0x52, 0xf2, 0xdc, 0x7c, 0x81, 0x21, 0x66, 0xc6, 0x3b, 0x9b, 0xb1, 0x11, 0xec, 0x4c, 0xb, 0xab, 0x56, 0xf6, 0xd8, 0x78, 0x85, 0x25, 0x62, 0xc2, 0x3f, 0x9f, 0x63, 0xc3, 0x3e, 0x9e, 0xd9, 0x79, 0x84, 0x24, 0xa, 0xaa, 0x57, 0xf7, 0xb0, 0x10, 0xed, 0x4d, 0x8, 0xa8, 0x55, 0xf5, 0xb2, 0x12, 0xef, 0x4f, 0x61, 0xc1, 0x3c, 0x9c, 0xdb, 0x7b, 0x86, 0x26, 0xda, 0x7a, 0x87, 0x27, 0x60, 0xc0, 0x3d, 0x9d, 0xb3, 0x13, 0xee, 0x4e, 0x9, 0xa9, 0x54, 0xf4}, + {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21, 0xc2, 0x63, 0x9d, 0x3c, 0x7c, 0xdd, 0x23, 0x82, 0xa3, 0x2, 0xfc, 0x5d, 0x1d, 0xbc, 0x42, 0xe3, 0x99, 0x38, 0xc6, 0x67, 0x27, 0x86, 0x78, 0xd9, 0xf8, 0x59, 0xa7, 0x6, 0x46, 0xe7, 0x19, 0xb8, 
0x5b, 0xfa, 0x4, 0xa5, 0xe5, 0x44, 0xba, 0x1b, 0x3a, 0x9b, 0x65, 0xc4, 0x84, 0x25, 0xdb, 0x7a, 0x2f, 0x8e, 0x70, 0xd1, 0x91, 0x30, 0xce, 0x6f, 0x4e, 0xef, 0x11, 0xb0, 0xf0, 0x51, 0xaf, 0xe, 0xed, 0x4c, 0xb2, 0x13, 0x53, 0xf2, 0xc, 0xad, 0x8c, 0x2d, 0xd3, 0x72, 0x32, 0x93, 0x6d, 0xcc, 0xb6, 0x17, 0xe9, 0x48, 0x8, 0xa9, 0x57, 0xf6, 0xd7, 0x76, 0x88, 0x29, 0x69, 0xc8, 0x36, 0x97, 0x74, 0xd5, 0x2b, 0x8a, 0xca, 0x6b, 0x95, 0x34, 0x15, 0xb4, 0x4a, 0xeb, 0xab, 0xa, 0xf4, 0x55, 0x5e, 0xff, 0x1, 0xa0, 0xe0, 0x41, 0xbf, 0x1e, 0x3f, 0x9e, 0x60, 0xc1, 0x81, 0x20, 0xde, 0x7f, 0x9c, 0x3d, 0xc3, 0x62, 0x22, 0x83, 0x7d, 0xdc, 0xfd, 0x5c, 0xa2, 0x3, 0x43, 0xe2, 0x1c, 0xbd, 0xc7, 0x66, 0x98, 0x39, 0x79, 0xd8, 0x26, 0x87, 0xa6, 0x7, 0xf9, 0x58, 0x18, 0xb9, 0x47, 0xe6, 0x5, 0xa4, 0x5a, 0xfb, 0xbb, 0x1a, 0xe4, 0x45, 0x64, 0xc5, 0x3b, 0x9a, 0xda, 0x7b, 0x85, 0x24, 0x71, 0xd0, 0x2e, 0x8f, 0xcf, 0x6e, 0x90, 0x31, 0x10, 0xb1, 0x4f, 0xee, 0xae, 0xf, 0xf1, 0x50, 0xb3, 0x12, 0xec, 0x4d, 0xd, 0xac, 0x52, 0xf3, 0xd2, 0x73, 0x8d, 0x2c, 0x6c, 0xcd, 0x33, 0x92, 0xe8, 0x49, 0xb7, 0x16, 0x56, 0xf7, 0x9, 0xa8, 0x89, 0x28, 0xd6, 0x77, 0x37, 0x96, 0x68, 0xc9, 0x2a, 0x8b, 0x75, 0xd4, 0x94, 0x35, 0xcb, 0x6a, 0x4b, 0xea, 0x14, 0xb5, 0xf5, 0x54, 0xaa, 0xb}, + {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30, 0xf2, 0x50, 0xab, 0x9, 0x40, 0xe2, 0x19, 0xbb, 0x8b, 0x29, 0xd2, 0x70, 0x39, 0x9b, 0x60, 0xc2, 0xf9, 0x5b, 0xa0, 0x2, 0x4b, 0xe9, 0x12, 0xb0, 0x80, 0x22, 0xd9, 0x7b, 0x32, 0x90, 0x6b, 0xc9, 0xb, 0xa9, 0x52, 0xf0, 0xb9, 0x1b, 0xe0, 0x42, 0x72, 0xd0, 0x2b, 0x89, 0xc0, 0x62, 0x99, 0x3b, 0xef, 0x4d, 0xb6, 0x14, 0x5d, 0xff, 0x4, 0xa6, 0x96, 0x34, 0xcf, 0x6d, 0x24, 0x86, 0x7d, 0xdf, 0x1d, 0xbf, 0x44, 0xe6, 0xaf, 0xd, 0xf6, 0x54, 0x64, 0xc6, 0x3d, 0x9f, 0xd6, 0x74, 0x8f, 0x2d, 0x16, 0xb4, 0x4f, 0xed, 0xa4, 0x6, 0xfd, 0x5f, 0x6f, 0xcd, 0x36, 0x94, 0xdd, 0x7f, 0x84, 0x26, 0xe4, 0x46, 0xbd, 0x1f, 0x56, 0xf4, 0xf, 0xad, 0x9d, 0x3f, 0xc4, 0x66, 0x2f, 0x8d, 0x76, 0xd4, 0xc3, 0x61, 0x9a, 0x38, 0x71, 0xd3, 0x28, 0x8a, 0xba, 0x18, 0xe3, 0x41, 0x8, 0xaa, 0x51, 0xf3, 0x31, 0x93, 0x68, 0xca, 0x83, 0x21, 0xda, 0x78, 0x48, 0xea, 0x11, 0xb3, 0xfa, 0x58, 0xa3, 0x1, 0x3a, 0x98, 0x63, 0xc1, 0x88, 0x2a, 0xd1, 0x73, 0x43, 0xe1, 0x1a, 0xb8, 0xf1, 0x53, 0xa8, 0xa, 0xc8, 0x6a, 0x91, 0x33, 0x7a, 0xd8, 0x23, 0x81, 0xb1, 0x13, 0xe8, 0x4a, 0x3, 0xa1, 0x5a, 0xf8, 0x2c, 0x8e, 0x75, 0xd7, 0x9e, 0x3c, 0xc7, 0x65, 0x55, 0xf7, 0xc, 0xae, 0xe7, 0x45, 0xbe, 0x1c, 0xde, 0x7c, 0x87, 0x25, 0x6c, 0xce, 0x35, 0x97, 0xa7, 0x5, 0xfe, 0x5c, 0x15, 0xb7, 0x4c, 0xee, 0xd5, 0x77, 0x8c, 0x2e, 0x67, 0xc5, 0x3e, 0x9c, 0xac, 0xe, 0xf5, 0x57, 0x1e, 0xbc, 0x47, 0xe5, 0x27, 0x85, 0x7e, 0xdc, 0x95, 0x37, 0xcc, 0x6e, 0x5e, 0xfc, 0x7, 0xa5, 0xec, 0x4e, 0xb5, 0x17}, + {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f, 0xe2, 0x41, 0xb9, 0x1a, 0x54, 0xf7, 0xf, 0xac, 0x93, 0x30, 0xc8, 0x6b, 0x25, 0x86, 0x7e, 0xdd, 0xd9, 0x7a, 0x82, 0x21, 0x6f, 0xcc, 0x34, 0x97, 0xa8, 0xb, 0xf3, 0x50, 0x1e, 0xbd, 0x45, 0xe6, 0x3b, 0x98, 0x60, 0xc3, 0x8d, 0x2e, 0xd6, 0x75, 0x4a, 0xe9, 0x11, 0xb2, 0xfc, 0x5f, 0xa7, 0x4, 0xaf, 0xc, 0xf4, 0x57, 0x19, 0xba, 0x42, 0xe1, 0xde, 0x7d, 0x85, 0x26, 0x68, 0xcb, 0x33, 0x90, 0x4d, 0xee, 0x16, 0xb5, 0xfb, 0x58, 0xa0, 0x3, 0x3c, 0x9f, 0x67, 0xc4, 0x8a, 0x29, 0xd1, 0x72, 0x76, 0xd5, 0x2d, 0x8e, 0xc0, 0x63, 0x9b, 0x38, 0x7, 0xa4, 0x5c, 0xff, 0xb1, 0x12, 0xea, 0x49, 0x94, 0x37, 0xcf, 0x6c, 0x22, 0x81, 0x79, 0xda, 0xe5, 0x46, 0xbe, 0x1d, 0x53, 0xf0, 0x8, 0xab, 0x43, 0xe0, 0x18, 0xbb, 0xf5, 
0x56, 0xae, 0xd, 0x32, 0x91, 0x69, 0xca, 0x84, 0x27, 0xdf, 0x7c, 0xa1, 0x2, 0xfa, 0x59, 0x17, 0xb4, 0x4c, 0xef, 0xd0, 0x73, 0x8b, 0x28, 0x66, 0xc5, 0x3d, 0x9e, 0x9a, 0x39, 0xc1, 0x62, 0x2c, 0x8f, 0x77, 0xd4, 0xeb, 0x48, 0xb0, 0x13, 0x5d, 0xfe, 0x6, 0xa5, 0x78, 0xdb, 0x23, 0x80, 0xce, 0x6d, 0x95, 0x36, 0x9, 0xaa, 0x52, 0xf1, 0xbf, 0x1c, 0xe4, 0x47, 0xec, 0x4f, 0xb7, 0x14, 0x5a, 0xf9, 0x1, 0xa2, 0x9d, 0x3e, 0xc6, 0x65, 0x2b, 0x88, 0x70, 0xd3, 0xe, 0xad, 0x55, 0xf6, 0xb8, 0x1b, 0xe3, 0x40, 0x7f, 0xdc, 0x24, 0x87, 0xc9, 0x6a, 0x92, 0x31, 0x35, 0x96, 0x6e, 0xcd, 0x83, 0x20, 0xd8, 0x7b, 0x44, 0xe7, 0x1f, 0xbc, 0xf2, 0x51, 0xa9, 0xa, 0xd7, 0x74, 0x8c, 0x2f, 0x61, 0xc2, 0x3a, 0x99, 0xa6, 0x5, 0xfd, 0x5e, 0x10, 0xb3, 0x4b, 0xe8}, + {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12, 0x92, 0x36, 0xc7, 0x63, 0x38, 0x9c, 0x6d, 0xc9, 0xdb, 0x7f, 0x8e, 0x2a, 0x71, 0xd5, 0x24, 0x80, 0x39, 0x9d, 0x6c, 0xc8, 0x93, 0x37, 0xc6, 0x62, 0x70, 0xd4, 0x25, 0x81, 0xda, 0x7e, 0x8f, 0x2b, 0xab, 0xf, 0xfe, 0x5a, 0x1, 0xa5, 0x54, 0xf0, 0xe2, 0x46, 0xb7, 0x13, 0x48, 0xec, 0x1d, 0xb9, 0x72, 0xd6, 0x27, 0x83, 0xd8, 0x7c, 0x8d, 0x29, 0x3b, 0x9f, 0x6e, 0xca, 0x91, 0x35, 0xc4, 0x60, 0xe0, 0x44, 0xb5, 0x11, 0x4a, 0xee, 0x1f, 0xbb, 0xa9, 0xd, 0xfc, 0x58, 0x3, 0xa7, 0x56, 0xf2, 0x4b, 0xef, 0x1e, 0xba, 0xe1, 0x45, 0xb4, 0x10, 0x2, 0xa6, 0x57, 0xf3, 0xa8, 0xc, 0xfd, 0x59, 0xd9, 0x7d, 0x8c, 0x28, 0x73, 0xd7, 0x26, 0x82, 0x90, 0x34, 0xc5, 0x61, 0x3a, 0x9e, 0x6f, 0xcb, 0xe4, 0x40, 0xb1, 0x15, 0x4e, 0xea, 0x1b, 0xbf, 0xad, 0x9, 0xf8, 0x5c, 0x7, 0xa3, 0x52, 0xf6, 0x76, 0xd2, 0x23, 0x87, 0xdc, 0x78, 0x89, 0x2d, 0x3f, 0x9b, 0x6a, 0xce, 0x95, 0x31, 0xc0, 0x64, 0xdd, 0x79, 0x88, 0x2c, 0x77, 0xd3, 0x22, 0x86, 0x94, 0x30, 0xc1, 0x65, 0x3e, 0x9a, 0x6b, 0xcf, 0x4f, 0xeb, 0x1a, 0xbe, 0xe5, 0x41, 0xb0, 0x14, 0x6, 0xa2, 0x53, 0xf7, 0xac, 0x8, 0xf9, 0x5d, 0x96, 0x32, 0xc3, 0x67, 0x3c, 0x98, 0x69, 0xcd, 0xdf, 0x7b, 0x8a, 0x2e, 0x75, 0xd1, 0x20, 0x84, 0x4, 0xa0, 0x51, 0xf5, 0xae, 0xa, 0xfb, 0x5f, 0x4d, 0xe9, 0x18, 0xbc, 0xe7, 0x43, 0xb2, 0x16, 0xaf, 0xb, 0xfa, 0x5e, 0x5, 0xa1, 0x50, 0xf4, 0xe6, 0x42, 0xb3, 0x17, 0x4c, 0xe8, 0x19, 0xbd, 0x3d, 0x99, 0x68, 0xcc, 0x97, 0x33, 0xc2, 0x66, 0x74, 0xd0, 0x21, 0x85, 0xde, 0x7a, 0x8b, 0x2f}, + {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d, 0x82, 0x27, 0xd5, 0x70, 0x2c, 0x89, 0x7b, 0xde, 0xc3, 0x66, 0x94, 0x31, 0x6d, 0xc8, 0x3a, 0x9f, 0x19, 0xbc, 0x4e, 0xeb, 0xb7, 0x12, 0xe0, 0x45, 0x58, 0xfd, 0xf, 0xaa, 0xf6, 0x53, 0xa1, 0x4, 0x9b, 0x3e, 0xcc, 0x69, 0x35, 0x90, 0x62, 0xc7, 0xda, 0x7f, 0x8d, 0x28, 0x74, 0xd1, 0x23, 0x86, 0x32, 0x97, 0x65, 0xc0, 0x9c, 0x39, 0xcb, 0x6e, 0x73, 0xd6, 0x24, 0x81, 0xdd, 0x78, 0x8a, 0x2f, 0xb0, 0x15, 0xe7, 0x42, 0x1e, 0xbb, 0x49, 0xec, 0xf1, 0x54, 0xa6, 0x3, 0x5f, 0xfa, 0x8, 0xad, 0x2b, 0x8e, 0x7c, 0xd9, 0x85, 0x20, 0xd2, 0x77, 0x6a, 0xcf, 0x3d, 0x98, 0xc4, 0x61, 0x93, 0x36, 0xa9, 0xc, 0xfe, 0x5b, 0x7, 0xa2, 0x50, 0xf5, 0xe8, 0x4d, 0xbf, 0x1a, 0x46, 0xe3, 0x11, 0xb4, 0x64, 0xc1, 0x33, 0x96, 0xca, 0x6f, 0x9d, 0x38, 0x25, 0x80, 0x72, 0xd7, 0x8b, 0x2e, 0xdc, 0x79, 0xe6, 0x43, 0xb1, 0x14, 0x48, 0xed, 0x1f, 0xba, 0xa7, 0x2, 0xf0, 0x55, 0x9, 0xac, 0x5e, 0xfb, 0x7d, 0xd8, 0x2a, 0x8f, 0xd3, 0x76, 0x84, 0x21, 0x3c, 0x99, 0x6b, 0xce, 0x92, 0x37, 0xc5, 0x60, 0xff, 0x5a, 0xa8, 0xd, 0x51, 0xf4, 0x6, 0xa3, 0xbe, 0x1b, 0xe9, 0x4c, 0x10, 0xb5, 0x47, 0xe2, 0x56, 0xf3, 0x1, 0xa4, 0xf8, 0x5d, 0xaf, 0xa, 0x17, 0xb2, 0x40, 0xe5, 0xb9, 0x1c, 0xee, 0x4b, 0xd4, 0x71, 0x83, 0x26, 0x7a, 0xdf, 0x2d, 0x88, 0x95, 0x30, 
0xc2, 0x67, 0x3b, 0x9e, 0x6c, 0xc9, 0x4f, 0xea, 0x18, 0xbd, 0xe1, 0x44, 0xb6, 0x13, 0xe, 0xab, 0x59, 0xfc, 0xa0, 0x5, 0xf7, 0x52, 0xcd, 0x68, 0x9a, 0x3f, 0x63, 0xc6, 0x34, 0x91, 0x8c, 0x29, 0xdb, 0x7e, 0x22, 0x87, 0x75, 0xd0}, + {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc, 0xb2, 0x14, 0xe3, 0x45, 0x10, 0xb6, 0x41, 0xe7, 0xeb, 0x4d, 0xba, 0x1c, 0x49, 0xef, 0x18, 0xbe, 0x79, 0xdf, 0x28, 0x8e, 0xdb, 0x7d, 0x8a, 0x2c, 0x20, 0x86, 0x71, 0xd7, 0x82, 0x24, 0xd3, 0x75, 0xcb, 0x6d, 0x9a, 0x3c, 0x69, 0xcf, 0x38, 0x9e, 0x92, 0x34, 0xc3, 0x65, 0x30, 0x96, 0x61, 0xc7, 0xf2, 0x54, 0xa3, 0x5, 0x50, 0xf6, 0x1, 0xa7, 0xab, 0xd, 0xfa, 0x5c, 0x9, 0xaf, 0x58, 0xfe, 0x40, 0xe6, 0x11, 0xb7, 0xe2, 0x44, 0xb3, 0x15, 0x19, 0xbf, 0x48, 0xee, 0xbb, 0x1d, 0xea, 0x4c, 0x8b, 0x2d, 0xda, 0x7c, 0x29, 0x8f, 0x78, 0xde, 0xd2, 0x74, 0x83, 0x25, 0x70, 0xd6, 0x21, 0x87, 0x39, 0x9f, 0x68, 0xce, 0x9b, 0x3d, 0xca, 0x6c, 0x60, 0xc6, 0x31, 0x97, 0xc2, 0x64, 0x93, 0x35, 0xf9, 0x5f, 0xa8, 0xe, 0x5b, 0xfd, 0xa, 0xac, 0xa0, 0x6, 0xf1, 0x57, 0x2, 0xa4, 0x53, 0xf5, 0x4b, 0xed, 0x1a, 0xbc, 0xe9, 0x4f, 0xb8, 0x1e, 0x12, 0xb4, 0x43, 0xe5, 0xb0, 0x16, 0xe1, 0x47, 0x80, 0x26, 0xd1, 0x77, 0x22, 0x84, 0x73, 0xd5, 0xd9, 0x7f, 0x88, 0x2e, 0x7b, 0xdd, 0x2a, 0x8c, 0x32, 0x94, 0x63, 0xc5, 0x90, 0x36, 0xc1, 0x67, 0x6b, 0xcd, 0x3a, 0x9c, 0xc9, 0x6f, 0x98, 0x3e, 0xb, 0xad, 0x5a, 0xfc, 0xa9, 0xf, 0xf8, 0x5e, 0x52, 0xf4, 0x3, 0xa5, 0xf0, 0x56, 0xa1, 0x7, 0xb9, 0x1f, 0xe8, 0x4e, 0x1b, 0xbd, 0x4a, 0xec, 0xe0, 0x46, 0xb1, 0x17, 0x42, 0xe4, 0x13, 0xb5, 0x72, 0xd4, 0x23, 0x85, 0xd0, 0x76, 0x81, 0x27, 0x2b, 0x8d, 0x7a, 0xdc, 0x89, 0x2f, 0xd8, 0x7e, 0xc0, 0x66, 0x91, 0x37, 0x62, 0xc4, 0x33, 0x95, 0x99, 0x3f, 0xc8, 0x6e, 0x3b, 0x9d, 0x6a, 0xcc}, + {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3, 0xa2, 0x5, 0xf1, 0x56, 0x4, 0xa3, 0x57, 0xf0, 0xf3, 0x54, 0xa0, 0x7, 0x55, 0xf2, 0x6, 0xa1, 0x59, 0xfe, 0xa, 0xad, 0xff, 0x58, 0xac, 0xb, 0x8, 0xaf, 0x5b, 0xfc, 0xae, 0x9, 0xfd, 0x5a, 0xfb, 0x5c, 0xa8, 0xf, 0x5d, 0xfa, 0xe, 0xa9, 0xaa, 0xd, 0xf9, 0x5e, 0xc, 0xab, 0x5f, 0xf8, 0xb2, 0x15, 0xe1, 0x46, 0x14, 0xb3, 0x47, 0xe0, 0xe3, 0x44, 0xb0, 0x17, 0x45, 0xe2, 0x16, 0xb1, 0x10, 0xb7, 0x43, 0xe4, 0xb6, 0x11, 0xe5, 0x42, 0x41, 0xe6, 0x12, 0xb5, 0xe7, 0x40, 0xb4, 0x13, 0xeb, 0x4c, 0xb8, 0x1f, 0x4d, 0xea, 0x1e, 0xb9, 0xba, 0x1d, 0xe9, 0x4e, 0x1c, 0xbb, 0x4f, 0xe8, 0x49, 0xee, 0x1a, 0xbd, 0xef, 0x48, 0xbc, 0x1b, 0x18, 0xbf, 0x4b, 0xec, 0xbe, 0x19, 0xed, 0x4a, 0x79, 0xde, 0x2a, 0x8d, 0xdf, 0x78, 0x8c, 0x2b, 0x28, 0x8f, 0x7b, 0xdc, 0x8e, 0x29, 0xdd, 0x7a, 0xdb, 0x7c, 0x88, 0x2f, 0x7d, 0xda, 0x2e, 0x89, 0x8a, 0x2d, 0xd9, 0x7e, 0x2c, 0x8b, 0x7f, 0xd8, 0x20, 0x87, 0x73, 0xd4, 0x86, 0x21, 0xd5, 0x72, 0x71, 0xd6, 0x22, 0x85, 0xd7, 0x70, 0x84, 0x23, 0x82, 0x25, 0xd1, 0x76, 0x24, 0x83, 0x77, 0xd0, 0xd3, 0x74, 0x80, 0x27, 0x75, 0xd2, 0x26, 0x81, 0xcb, 0x6c, 0x98, 0x3f, 0x6d, 0xca, 0x3e, 0x99, 0x9a, 0x3d, 0xc9, 0x6e, 0x3c, 0x9b, 0x6f, 0xc8, 0x69, 0xce, 0x3a, 0x9d, 0xcf, 0x68, 0x9c, 0x3b, 0x38, 0x9f, 0x6b, 0xcc, 0x9e, 0x39, 0xcd, 0x6a, 0x92, 0x35, 0xc1, 0x66, 0x34, 0x93, 0x67, 0xc0, 0xc3, 0x64, 0x90, 0x37, 0x65, 0xc2, 0x36, 0x91, 0x30, 0x97, 0x63, 0xc4, 0x96, 0x31, 0xc5, 0x62, 0x61, 0xc6, 0x32, 0x95, 0xc7, 0x60, 0x94, 0x33}, + {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56, 0x52, 0xfa, 0x1f, 0xb7, 0xc8, 0x60, 0x85, 0x2d, 0x7b, 0xd3, 0x36, 0x9e, 0xe1, 0x49, 0xac, 0x4, 0xa4, 0xc, 0xe9, 0x41, 0x3e, 0x96, 0x73, 0xdb, 0x8d, 0x25, 0xc0, 0x68, 0x17, 0xbf, 
0x5a, 0xf2, 0xf6, 0x5e, 0xbb, 0x13, 0x6c, 0xc4, 0x21, 0x89, 0xdf, 0x77, 0x92, 0x3a, 0x45, 0xed, 0x8, 0xa0, 0x55, 0xfd, 0x18, 0xb0, 0xcf, 0x67, 0x82, 0x2a, 0x7c, 0xd4, 0x31, 0x99, 0xe6, 0x4e, 0xab, 0x3, 0x7, 0xaf, 0x4a, 0xe2, 0x9d, 0x35, 0xd0, 0x78, 0x2e, 0x86, 0x63, 0xcb, 0xb4, 0x1c, 0xf9, 0x51, 0xf1, 0x59, 0xbc, 0x14, 0x6b, 0xc3, 0x26, 0x8e, 0xd8, 0x70, 0x95, 0x3d, 0x42, 0xea, 0xf, 0xa7, 0xa3, 0xb, 0xee, 0x46, 0x39, 0x91, 0x74, 0xdc, 0x8a, 0x22, 0xc7, 0x6f, 0x10, 0xb8, 0x5d, 0xf5, 0xaa, 0x2, 0xe7, 0x4f, 0x30, 0x98, 0x7d, 0xd5, 0x83, 0x2b, 0xce, 0x66, 0x19, 0xb1, 0x54, 0xfc, 0xf8, 0x50, 0xb5, 0x1d, 0x62, 0xca, 0x2f, 0x87, 0xd1, 0x79, 0x9c, 0x34, 0x4b, 0xe3, 0x6, 0xae, 0xe, 0xa6, 0x43, 0xeb, 0x94, 0x3c, 0xd9, 0x71, 0x27, 0x8f, 0x6a, 0xc2, 0xbd, 0x15, 0xf0, 0x58, 0x5c, 0xf4, 0x11, 0xb9, 0xc6, 0x6e, 0x8b, 0x23, 0x75, 0xdd, 0x38, 0x90, 0xef, 0x47, 0xa2, 0xa, 0xff, 0x57, 0xb2, 0x1a, 0x65, 0xcd, 0x28, 0x80, 0xd6, 0x7e, 0x9b, 0x33, 0x4c, 0xe4, 0x1, 0xa9, 0xad, 0x5, 0xe0, 0x48, 0x37, 0x9f, 0x7a, 0xd2, 0x84, 0x2c, 0xc9, 0x61, 0x1e, 0xb6, 0x53, 0xfb, 0x5b, 0xf3, 0x16, 0xbe, 0xc1, 0x69, 0x8c, 0x24, 0x72, 0xda, 0x3f, 0x97, 0xe8, 0x40, 0xa5, 0xd, 0x9, 0xa1, 0x44, 0xec, 0x93, 0x3b, 0xde, 0x76, 0x20, 0x88, 0x6d, 0xc5, 0xba, 0x12, 0xf7, 0x5f}, + {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59, 0x42, 0xeb, 0xd, 0xa4, 0xdc, 0x75, 0x93, 0x3a, 0x63, 0xca, 0x2c, 0x85, 0xfd, 0x54, 0xb2, 0x1b, 0x84, 0x2d, 0xcb, 0x62, 0x1a, 0xb3, 0x55, 0xfc, 0xa5, 0xc, 0xea, 0x43, 0x3b, 0x92, 0x74, 0xdd, 0xc6, 0x6f, 0x89, 0x20, 0x58, 0xf1, 0x17, 0xbe, 0xe7, 0x4e, 0xa8, 0x1, 0x79, 0xd0, 0x36, 0x9f, 0x15, 0xbc, 0x5a, 0xf3, 0x8b, 0x22, 0xc4, 0x6d, 0x34, 0x9d, 0x7b, 0xd2, 0xaa, 0x3, 0xe5, 0x4c, 0x57, 0xfe, 0x18, 0xb1, 0xc9, 0x60, 0x86, 0x2f, 0x76, 0xdf, 0x39, 0x90, 0xe8, 0x41, 0xa7, 0xe, 0x91, 0x38, 0xde, 0x77, 0xf, 0xa6, 0x40, 0xe9, 0xb0, 0x19, 0xff, 0x56, 0x2e, 0x87, 0x61, 0xc8, 0xd3, 0x7a, 0x9c, 0x35, 0x4d, 0xe4, 0x2, 0xab, 0xf2, 0x5b, 0xbd, 0x14, 0x6c, 0xc5, 0x23, 0x8a, 0x2a, 0x83, 0x65, 0xcc, 0xb4, 0x1d, 0xfb, 0x52, 0xb, 0xa2, 0x44, 0xed, 0x95, 0x3c, 0xda, 0x73, 0x68, 0xc1, 0x27, 0x8e, 0xf6, 0x5f, 0xb9, 0x10, 0x49, 0xe0, 0x6, 0xaf, 0xd7, 0x7e, 0x98, 0x31, 0xae, 0x7, 0xe1, 0x48, 0x30, 0x99, 0x7f, 0xd6, 0x8f, 0x26, 0xc0, 0x69, 0x11, 0xb8, 0x5e, 0xf7, 0xec, 0x45, 0xa3, 0xa, 0x72, 0xdb, 0x3d, 0x94, 0xcd, 0x64, 0x82, 0x2b, 0x53, 0xfa, 0x1c, 0xb5, 0x3f, 0x96, 0x70, 0xd9, 0xa1, 0x8, 0xee, 0x47, 0x1e, 0xb7, 0x51, 0xf8, 0x80, 0x29, 0xcf, 0x66, 0x7d, 0xd4, 0x32, 0x9b, 0xe3, 0x4a, 0xac, 0x5, 0x5c, 0xf5, 0x13, 0xba, 0xc2, 0x6b, 0x8d, 0x24, 0xbb, 0x12, 0xf4, 0x5d, 0x25, 0x8c, 0x6a, 0xc3, 0x9a, 0x33, 0xd5, 0x7c, 0x4, 0xad, 0x4b, 0xe2, 0xf9, 0x50, 0xb6, 0x1f, 0x67, 0xce, 0x28, 0x81, 0xd8, 0x71, 0x97, 0x3e, 0x46, 0xef, 0x9, 0xa0}, + {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48, 0x72, 0xd8, 0x3b, 0x91, 0xe0, 0x4a, 0xa9, 0x3, 0x4b, 0xe1, 0x2, 0xa8, 0xd9, 0x73, 0x90, 0x3a, 0xe4, 0x4e, 0xad, 0x7, 0x76, 0xdc, 0x3f, 0x95, 0xdd, 0x77, 0x94, 0x3e, 0x4f, 0xe5, 0x6, 0xac, 0x96, 0x3c, 0xdf, 0x75, 0x4, 0xae, 0x4d, 0xe7, 0xaf, 0x5, 0xe6, 0x4c, 0x3d, 0x97, 0x74, 0xde, 0xd5, 0x7f, 0x9c, 0x36, 0x47, 0xed, 0xe, 0xa4, 0xec, 0x46, 0xa5, 0xf, 0x7e, 0xd4, 0x37, 0x9d, 0xa7, 0xd, 0xee, 0x44, 0x35, 0x9f, 0x7c, 0xd6, 0x9e, 0x34, 0xd7, 0x7d, 0xc, 0xa6, 0x45, 0xef, 0x31, 0x9b, 0x78, 0xd2, 0xa3, 0x9, 0xea, 0x40, 0x8, 0xa2, 0x41, 0xeb, 0x9a, 0x30, 0xd3, 0x79, 0x43, 0xe9, 0xa, 0xa0, 0xd1, 0x7b, 0x98, 0x32, 0x7a, 0xd0, 0x33, 0x99, 0xe8, 0x42, 0xa1, 0xb, 0xb7, 0x1d, 0xfe, 0x54, 
0x25, 0x8f, 0x6c, 0xc6, 0x8e, 0x24, 0xc7, 0x6d, 0x1c, 0xb6, 0x55, 0xff, 0xc5, 0x6f, 0x8c, 0x26, 0x57, 0xfd, 0x1e, 0xb4, 0xfc, 0x56, 0xb5, 0x1f, 0x6e, 0xc4, 0x27, 0x8d, 0x53, 0xf9, 0x1a, 0xb0, 0xc1, 0x6b, 0x88, 0x22, 0x6a, 0xc0, 0x23, 0x89, 0xf8, 0x52, 0xb1, 0x1b, 0x21, 0x8b, 0x68, 0xc2, 0xb3, 0x19, 0xfa, 0x50, 0x18, 0xb2, 0x51, 0xfb, 0x8a, 0x20, 0xc3, 0x69, 0x62, 0xc8, 0x2b, 0x81, 0xf0, 0x5a, 0xb9, 0x13, 0x5b, 0xf1, 0x12, 0xb8, 0xc9, 0x63, 0x80, 0x2a, 0x10, 0xba, 0x59, 0xf3, 0x82, 0x28, 0xcb, 0x61, 0x29, 0x83, 0x60, 0xca, 0xbb, 0x11, 0xf2, 0x58, 0x86, 0x2c, 0xcf, 0x65, 0x14, 0xbe, 0x5d, 0xf7, 0xbf, 0x15, 0xf6, 0x5c, 0x2d, 0x87, 0x64, 0xce, 0xf4, 0x5e, 0xbd, 0x17, 0x66, 0xcc, 0x2f, 0x85, 0xcd, 0x67, 0x84, 0x2e, 0x5f, 0xf5, 0x16, 0xbc}, + {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47, 0x62, 0xc9, 0x29, 0x82, 0xf4, 0x5f, 0xbf, 0x14, 0x53, 0xf8, 0x18, 0xb3, 0xc5, 0x6e, 0x8e, 0x25, 0xc4, 0x6f, 0x8f, 0x24, 0x52, 0xf9, 0x19, 0xb2, 0xf5, 0x5e, 0xbe, 0x15, 0x63, 0xc8, 0x28, 0x83, 0xa6, 0xd, 0xed, 0x46, 0x30, 0x9b, 0x7b, 0xd0, 0x97, 0x3c, 0xdc, 0x77, 0x1, 0xaa, 0x4a, 0xe1, 0x95, 0x3e, 0xde, 0x75, 0x3, 0xa8, 0x48, 0xe3, 0xa4, 0xf, 0xef, 0x44, 0x32, 0x99, 0x79, 0xd2, 0xf7, 0x5c, 0xbc, 0x17, 0x61, 0xca, 0x2a, 0x81, 0xc6, 0x6d, 0x8d, 0x26, 0x50, 0xfb, 0x1b, 0xb0, 0x51, 0xfa, 0x1a, 0xb1, 0xc7, 0x6c, 0x8c, 0x27, 0x60, 0xcb, 0x2b, 0x80, 0xf6, 0x5d, 0xbd, 0x16, 0x33, 0x98, 0x78, 0xd3, 0xa5, 0xe, 0xee, 0x45, 0x2, 0xa9, 0x49, 0xe2, 0x94, 0x3f, 0xdf, 0x74, 0x37, 0x9c, 0x7c, 0xd7, 0xa1, 0xa, 0xea, 0x41, 0x6, 0xad, 0x4d, 0xe6, 0x90, 0x3b, 0xdb, 0x70, 0x55, 0xfe, 0x1e, 0xb5, 0xc3, 0x68, 0x88, 0x23, 0x64, 0xcf, 0x2f, 0x84, 0xf2, 0x59, 0xb9, 0x12, 0xf3, 0x58, 0xb8, 0x13, 0x65, 0xce, 0x2e, 0x85, 0xc2, 0x69, 0x89, 0x22, 0x54, 0xff, 0x1f, 0xb4, 0x91, 0x3a, 0xda, 0x71, 0x7, 0xac, 0x4c, 0xe7, 0xa0, 0xb, 0xeb, 0x40, 0x36, 0x9d, 0x7d, 0xd6, 0xa2, 0x9, 0xe9, 0x42, 0x34, 0x9f, 0x7f, 0xd4, 0x93, 0x38, 0xd8, 0x73, 0x5, 0xae, 0x4e, 0xe5, 0xc0, 0x6b, 0x8b, 0x20, 0x56, 0xfd, 0x1d, 0xb6, 0xf1, 0x5a, 0xba, 0x11, 0x67, 0xcc, 0x2c, 0x87, 0x66, 0xcd, 0x2d, 0x86, 0xf0, 0x5b, 0xbb, 0x10, 0x57, 0xfc, 0x1c, 0xb7, 0xc1, 0x6a, 0x8a, 0x21, 0x4, 0xaf, 0x4f, 0xe4, 0x92, 0x39, 0xd9, 0x72, 0x35, 0x9e, 0x7e, 0xd5, 0xa3, 0x8, 0xe8, 0x43}, + {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a, 0x12, 0xbe, 0x57, 0xfb, 0x98, 0x34, 0xdd, 0x71, 0x1b, 0xb7, 0x5e, 0xf2, 0x91, 0x3d, 0xd4, 0x78, 0x24, 0x88, 0x61, 0xcd, 0xae, 0x2, 0xeb, 0x47, 0x2d, 0x81, 0x68, 0xc4, 0xa7, 0xb, 0xe2, 0x4e, 0x36, 0x9a, 0x73, 0xdf, 0xbc, 0x10, 0xf9, 0x55, 0x3f, 0x93, 0x7a, 0xd6, 0xb5, 0x19, 0xf0, 0x5c, 0x48, 0xe4, 0xd, 0xa1, 0xc2, 0x6e, 0x87, 0x2b, 0x41, 0xed, 0x4, 0xa8, 0xcb, 0x67, 0x8e, 0x22, 0x5a, 0xf6, 0x1f, 0xb3, 0xd0, 0x7c, 0x95, 0x39, 0x53, 0xff, 0x16, 0xba, 0xd9, 0x75, 0x9c, 0x30, 0x6c, 0xc0, 0x29, 0x85, 0xe6, 0x4a, 0xa3, 0xf, 0x65, 0xc9, 0x20, 0x8c, 0xef, 0x43, 0xaa, 0x6, 0x7e, 0xd2, 0x3b, 0x97, 0xf4, 0x58, 0xb1, 0x1d, 0x77, 0xdb, 0x32, 0x9e, 0xfd, 0x51, 0xb8, 0x14, 0x90, 0x3c, 0xd5, 0x79, 0x1a, 0xb6, 0x5f, 0xf3, 0x99, 0x35, 0xdc, 0x70, 0x13, 0xbf, 0x56, 0xfa, 0x82, 0x2e, 0xc7, 0x6b, 0x8, 0xa4, 0x4d, 0xe1, 0x8b, 0x27, 0xce, 0x62, 0x1, 0xad, 0x44, 0xe8, 0xb4, 0x18, 0xf1, 0x5d, 0x3e, 0x92, 0x7b, 0xd7, 0xbd, 0x11, 0xf8, 0x54, 0x37, 0x9b, 0x72, 0xde, 0xa6, 0xa, 0xe3, 0x4f, 0x2c, 0x80, 0x69, 0xc5, 0xaf, 0x3, 0xea, 0x46, 0x25, 0x89, 0x60, 0xcc, 0xd8, 0x74, 0x9d, 0x31, 0x52, 0xfe, 0x17, 0xbb, 0xd1, 0x7d, 0x94, 0x38, 0x5b, 0xf7, 0x1e, 0xb2, 0xca, 0x66, 0x8f, 0x23, 0x40, 0xec, 0x5, 0xa9, 
0xc3, 0x6f, 0x86, 0x2a, 0x49, 0xe5, 0xc, 0xa0, 0xfc, 0x50, 0xb9, 0x15, 0x76, 0xda, 0x33, 0x9f, 0xf5, 0x59, 0xb0, 0x1c, 0x7f, 0xd3, 0x3a, 0x96, 0xee, 0x42, 0xab, 0x7, 0x64, 0xc8, 0x21, 0x8d, 0xe7, 0x4b, 0xa2, 0xe, 0x6d, 0xc1, 0x28, 0x84}, + {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65, 0x2, 0xaf, 0x45, 0xe8, 0x8c, 0x21, 0xcb, 0x66, 0x3, 0xae, 0x44, 0xe9, 0x8d, 0x20, 0xca, 0x67, 0x4, 0xa9, 0x43, 0xee, 0x8a, 0x27, 0xcd, 0x60, 0x5, 0xa8, 0x42, 0xef, 0x8b, 0x26, 0xcc, 0x61, 0x6, 0xab, 0x41, 0xec, 0x88, 0x25, 0xcf, 0x62, 0x7, 0xaa, 0x40, 0xed, 0x89, 0x24, 0xce, 0x63, 0x8, 0xa5, 0x4f, 0xe2, 0x86, 0x2b, 0xc1, 0x6c, 0x9, 0xa4, 0x4e, 0xe3, 0x87, 0x2a, 0xc0, 0x6d, 0xa, 0xa7, 0x4d, 0xe0, 0x84, 0x29, 0xc3, 0x6e, 0xb, 0xa6, 0x4c, 0xe1, 0x85, 0x28, 0xc2, 0x6f, 0xc, 0xa1, 0x4b, 0xe6, 0x82, 0x2f, 0xc5, 0x68, 0xd, 0xa0, 0x4a, 0xe7, 0x83, 0x2e, 0xc4, 0x69, 0xe, 0xa3, 0x49, 0xe4, 0x80, 0x2d, 0xc7, 0x6a, 0xf, 0xa2, 0x48, 0xe5, 0x81, 0x2c, 0xc6, 0x6b, 0x10, 0xbd, 0x57, 0xfa, 0x9e, 0x33, 0xd9, 0x74, 0x11, 0xbc, 0x56, 0xfb, 0x9f, 0x32, 0xd8, 0x75, 0x12, 0xbf, 0x55, 0xf8, 0x9c, 0x31, 0xdb, 0x76, 0x13, 0xbe, 0x54, 0xf9, 0x9d, 0x30, 0xda, 0x77, 0x14, 0xb9, 0x53, 0xfe, 0x9a, 0x37, 0xdd, 0x70, 0x15, 0xb8, 0x52, 0xff, 0x9b, 0x36, 0xdc, 0x71, 0x16, 0xbb, 0x51, 0xfc, 0x98, 0x35, 0xdf, 0x72, 0x17, 0xba, 0x50, 0xfd, 0x99, 0x34, 0xde, 0x73, 0x18, 0xb5, 0x5f, 0xf2, 0x96, 0x3b, 0xd1, 0x7c, 0x19, 0xb4, 0x5e, 0xf3, 0x97, 0x3a, 0xd0, 0x7d, 0x1a, 0xb7, 0x5d, 0xf0, 0x94, 0x39, 0xd3, 0x7e, 0x1b, 0xb6, 0x5c, 0xf1, 0x95, 0x38, 0xd2, 0x7f, 0x1c, 0xb1, 0x5b, 0xf6, 0x92, 0x3f, 0xd5, 0x78, 0x1d, 0xb0, 0x5a, 0xf7, 0x93, 0x3e, 0xd4, 0x79, 0x1e, 0xb3, 0x59, 0xf4, 0x90, 0x3d, 0xd7, 0x7a, 0x1f, 0xb2, 0x58, 0xf5, 0x91, 0x3c, 0xd6, 0x7b}, + {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74, 0x32, 0x9c, 0x73, 0xdd, 0xb0, 0x1e, 0xf1, 0x5f, 0x2b, 0x85, 0x6a, 0xc4, 0xa9, 0x7, 0xe8, 0x46, 0x64, 0xca, 0x25, 0x8b, 0xe6, 0x48, 0xa7, 0x9, 0x7d, 0xd3, 0x3c, 0x92, 0xff, 0x51, 0xbe, 0x10, 0x56, 0xf8, 0x17, 0xb9, 0xd4, 0x7a, 0x95, 0x3b, 0x4f, 0xe1, 0xe, 0xa0, 0xcd, 0x63, 0x8c, 0x22, 0xc8, 0x66, 0x89, 0x27, 0x4a, 0xe4, 0xb, 0xa5, 0xd1, 0x7f, 0x90, 0x3e, 0x53, 0xfd, 0x12, 0xbc, 0xfa, 0x54, 0xbb, 0x15, 0x78, 0xd6, 0x39, 0x97, 0xe3, 0x4d, 0xa2, 0xc, 0x61, 0xcf, 0x20, 0x8e, 0xac, 0x2, 0xed, 0x43, 0x2e, 0x80, 0x6f, 0xc1, 0xb5, 0x1b, 0xf4, 0x5a, 0x37, 0x99, 0x76, 0xd8, 0x9e, 0x30, 0xdf, 0x71, 0x1c, 0xb2, 0x5d, 0xf3, 0x87, 0x29, 0xc6, 0x68, 0x5, 0xab, 0x44, 0xea, 0x8d, 0x23, 0xcc, 0x62, 0xf, 0xa1, 0x4e, 0xe0, 0x94, 0x3a, 0xd5, 0x7b, 0x16, 0xb8, 0x57, 0xf9, 0xbf, 0x11, 0xfe, 0x50, 0x3d, 0x93, 0x7c, 0xd2, 0xa6, 0x8, 0xe7, 0x49, 0x24, 0x8a, 0x65, 0xcb, 0xe9, 0x47, 0xa8, 0x6, 0x6b, 0xc5, 0x2a, 0x84, 0xf0, 0x5e, 0xb1, 0x1f, 0x72, 0xdc, 0x33, 0x9d, 0xdb, 0x75, 0x9a, 0x34, 0x59, 0xf7, 0x18, 0xb6, 0xc2, 0x6c, 0x83, 0x2d, 0x40, 0xee, 0x1, 0xaf, 0x45, 0xeb, 0x4, 0xaa, 0xc7, 0x69, 0x86, 0x28, 0x5c, 0xf2, 0x1d, 0xb3, 0xde, 0x70, 0x9f, 0x31, 0x77, 0xd9, 0x36, 0x98, 0xf5, 0x5b, 0xb4, 0x1a, 0x6e, 0xc0, 0x2f, 0x81, 0xec, 0x42, 0xad, 0x3, 0x21, 0x8f, 0x60, 0xce, 0xa3, 0xd, 0xe2, 0x4c, 0x38, 0x96, 0x79, 0xd7, 0xba, 0x14, 0xfb, 0x55, 0x13, 0xbd, 0x52, 0xfc, 0x91, 0x3f, 0xd0, 0x7e, 0xa, 0xa4, 0x4b, 0xe5, 0x88, 0x26, 0xc9, 0x67}, + {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b, 0x22, 0x8d, 0x61, 0xce, 0xa4, 0xb, 0xe7, 0x48, 0x33, 0x9c, 0x70, 0xdf, 0xb5, 0x1a, 0xf6, 0x59, 0x44, 0xeb, 0x7, 0xa8, 0xc2, 0x6d, 0x81, 0x2e, 0x55, 0xfa, 0x16, 0xb9, 
0xd3, 0x7c, 0x90, 0x3f, 0x66, 0xc9, 0x25, 0x8a, 0xe0, 0x4f, 0xa3, 0xc, 0x77, 0xd8, 0x34, 0x9b, 0xf1, 0x5e, 0xb2, 0x1d, 0x88, 0x27, 0xcb, 0x64, 0xe, 0xa1, 0x4d, 0xe2, 0x99, 0x36, 0xda, 0x75, 0x1f, 0xb0, 0x5c, 0xf3, 0xaa, 0x5, 0xe9, 0x46, 0x2c, 0x83, 0x6f, 0xc0, 0xbb, 0x14, 0xf8, 0x57, 0x3d, 0x92, 0x7e, 0xd1, 0xcc, 0x63, 0x8f, 0x20, 0x4a, 0xe5, 0x9, 0xa6, 0xdd, 0x72, 0x9e, 0x31, 0x5b, 0xf4, 0x18, 0xb7, 0xee, 0x41, 0xad, 0x2, 0x68, 0xc7, 0x2b, 0x84, 0xff, 0x50, 0xbc, 0x13, 0x79, 0xd6, 0x3a, 0x95, 0xd, 0xa2, 0x4e, 0xe1, 0x8b, 0x24, 0xc8, 0x67, 0x1c, 0xb3, 0x5f, 0xf0, 0x9a, 0x35, 0xd9, 0x76, 0x2f, 0x80, 0x6c, 0xc3, 0xa9, 0x6, 0xea, 0x45, 0x3e, 0x91, 0x7d, 0xd2, 0xb8, 0x17, 0xfb, 0x54, 0x49, 0xe6, 0xa, 0xa5, 0xcf, 0x60, 0x8c, 0x23, 0x58, 0xf7, 0x1b, 0xb4, 0xde, 0x71, 0x9d, 0x32, 0x6b, 0xc4, 0x28, 0x87, 0xed, 0x42, 0xae, 0x1, 0x7a, 0xd5, 0x39, 0x96, 0xfc, 0x53, 0xbf, 0x10, 0x85, 0x2a, 0xc6, 0x69, 0x3, 0xac, 0x40, 0xef, 0x94, 0x3b, 0xd7, 0x78, 0x12, 0xbd, 0x51, 0xfe, 0xa7, 0x8, 0xe4, 0x4b, 0x21, 0x8e, 0x62, 0xcd, 0xb6, 0x19, 0xf5, 0x5a, 0x30, 0x9f, 0x73, 0xdc, 0xc1, 0x6e, 0x82, 0x2d, 0x47, 0xe8, 0x4, 0xab, 0xd0, 0x7f, 0x93, 0x3c, 0x56, 0xf9, 0x15, 0xba, 0xe3, 0x4c, 0xa0, 0xf, 0x65, 0xca, 0x26, 0x89, 0xf2, 0x5d, 0xb1, 0x1e, 0x74, 0xdb, 0x37, 0x98}, + {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde, 0xcf, 0x7f, 0xb2, 0x2, 0x35, 0x85, 0x48, 0xf8, 0x26, 0x96, 0x5b, 0xeb, 0xdc, 0x6c, 0xa1, 0x11, 0x83, 0x33, 0xfe, 0x4e, 0x79, 0xc9, 0x4, 0xb4, 0x6a, 0xda, 0x17, 0xa7, 0x90, 0x20, 0xed, 0x5d, 0x4c, 0xfc, 0x31, 0x81, 0xb6, 0x6, 0xcb, 0x7b, 0xa5, 0x15, 0xd8, 0x68, 0x5f, 0xef, 0x22, 0x92, 0x1b, 0xab, 0x66, 0xd6, 0xe1, 0x51, 0x9c, 0x2c, 0xf2, 0x42, 0x8f, 0x3f, 0x8, 0xb8, 0x75, 0xc5, 0xd4, 0x64, 0xa9, 0x19, 0x2e, 0x9e, 0x53, 0xe3, 0x3d, 0x8d, 0x40, 0xf0, 0xc7, 0x77, 0xba, 0xa, 0x98, 0x28, 0xe5, 0x55, 0x62, 0xd2, 0x1f, 0xaf, 0x71, 0xc1, 0xc, 0xbc, 0x8b, 0x3b, 0xf6, 0x46, 0x57, 0xe7, 0x2a, 0x9a, 0xad, 0x1d, 0xd0, 0x60, 0xbe, 0xe, 0xc3, 0x73, 0x44, 0xf4, 0x39, 0x89, 0x36, 0x86, 0x4b, 0xfb, 0xcc, 0x7c, 0xb1, 0x1, 0xdf, 0x6f, 0xa2, 0x12, 0x25, 0x95, 0x58, 0xe8, 0xf9, 0x49, 0x84, 0x34, 0x3, 0xb3, 0x7e, 0xce, 0x10, 0xa0, 0x6d, 0xdd, 0xea, 0x5a, 0x97, 0x27, 0xb5, 0x5, 0xc8, 0x78, 0x4f, 0xff, 0x32, 0x82, 0x5c, 0xec, 0x21, 0x91, 0xa6, 0x16, 0xdb, 0x6b, 0x7a, 0xca, 0x7, 0xb7, 0x80, 0x30, 0xfd, 0x4d, 0x93, 0x23, 0xee, 0x5e, 0x69, 0xd9, 0x14, 0xa4, 0x2d, 0x9d, 0x50, 0xe0, 0xd7, 0x67, 0xaa, 0x1a, 0xc4, 0x74, 0xb9, 0x9, 0x3e, 0x8e, 0x43, 0xf3, 0xe2, 0x52, 0x9f, 0x2f, 0x18, 0xa8, 0x65, 0xd5, 0xb, 0xbb, 0x76, 0xc6, 0xf1, 0x41, 0x8c, 0x3c, 0xae, 0x1e, 0xd3, 0x63, 0x54, 0xe4, 0x29, 0x99, 0x47, 0xf7, 0x3a, 0x8a, 0xbd, 0xd, 0xc0, 0x70, 0x61, 0xd1, 0x1c, 0xac, 0x9b, 0x2b, 0xe6, 0x56, 0x88, 0x38, 0xf5, 0x45, 0x72, 0xc2, 0xf, 0xbf}, + {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1, 0xdf, 0x6e, 0xa0, 0x11, 0x21, 0x90, 0x5e, 0xef, 0x3e, 0x8f, 0x41, 0xf0, 0xc0, 0x71, 0xbf, 0xe, 0xa3, 0x12, 0xdc, 0x6d, 0x5d, 0xec, 0x22, 0x93, 0x42, 0xf3, 0x3d, 0x8c, 0xbc, 0xd, 0xc3, 0x72, 0x7c, 0xcd, 0x3, 0xb2, 0x82, 0x33, 0xfd, 0x4c, 0x9d, 0x2c, 0xe2, 0x53, 0x63, 0xd2, 0x1c, 0xad, 0x5b, 0xea, 0x24, 0x95, 0xa5, 0x14, 0xda, 0x6b, 0xba, 0xb, 0xc5, 0x74, 0x44, 0xf5, 0x3b, 0x8a, 0x84, 0x35, 0xfb, 0x4a, 0x7a, 0xcb, 0x5, 0xb4, 0x65, 0xd4, 0x1a, 0xab, 0x9b, 0x2a, 0xe4, 0x55, 0xf8, 0x49, 0x87, 0x36, 0x6, 0xb7, 0x79, 0xc8, 0x19, 0xa8, 0x66, 0xd7, 0xe7, 0x56, 0x98, 0x29, 0x27, 0x96, 0x58, 0xe9, 0xd9, 0x68, 0xa6, 0x17, 0xc6, 0x77, 0xb9, 0x8, 0x38, 0x89, 0x47, 0xf6, 0xb6, 
0x7, 0xc9, 0x78, 0x48, 0xf9, 0x37, 0x86, 0x57, 0xe6, 0x28, 0x99, 0xa9, 0x18, 0xd6, 0x67, 0x69, 0xd8, 0x16, 0xa7, 0x97, 0x26, 0xe8, 0x59, 0x88, 0x39, 0xf7, 0x46, 0x76, 0xc7, 0x9, 0xb8, 0x15, 0xa4, 0x6a, 0xdb, 0xeb, 0x5a, 0x94, 0x25, 0xf4, 0x45, 0x8b, 0x3a, 0xa, 0xbb, 0x75, 0xc4, 0xca, 0x7b, 0xb5, 0x4, 0x34, 0x85, 0x4b, 0xfa, 0x2b, 0x9a, 0x54, 0xe5, 0xd5, 0x64, 0xaa, 0x1b, 0xed, 0x5c, 0x92, 0x23, 0x13, 0xa2, 0x6c, 0xdd, 0xc, 0xbd, 0x73, 0xc2, 0xf2, 0x43, 0x8d, 0x3c, 0x32, 0x83, 0x4d, 0xfc, 0xcc, 0x7d, 0xb3, 0x2, 0xd3, 0x62, 0xac, 0x1d, 0x2d, 0x9c, 0x52, 0xe3, 0x4e, 0xff, 0x31, 0x80, 0xb0, 0x1, 0xcf, 0x7e, 0xaf, 0x1e, 0xd0, 0x61, 0x51, 0xe0, 0x2e, 0x9f, 0x91, 0x20, 0xee, 0x5f, 0x6f, 0xde, 0x10, 0xa1, 0x70, 0xc1, 0xf, 0xbe, 0x8e, 0x3f, 0xf1, 0x40}, + {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0, 0xef, 0x5d, 0x96, 0x24, 0x1d, 0xaf, 0x64, 0xd6, 0x16, 0xa4, 0x6f, 0xdd, 0xe4, 0x56, 0x9d, 0x2f, 0xc3, 0x71, 0xba, 0x8, 0x31, 0x83, 0x48, 0xfa, 0x3a, 0x88, 0x43, 0xf1, 0xc8, 0x7a, 0xb1, 0x3, 0x2c, 0x9e, 0x55, 0xe7, 0xde, 0x6c, 0xa7, 0x15, 0xd5, 0x67, 0xac, 0x1e, 0x27, 0x95, 0x5e, 0xec, 0x9b, 0x29, 0xe2, 0x50, 0x69, 0xdb, 0x10, 0xa2, 0x62, 0xd0, 0x1b, 0xa9, 0x90, 0x22, 0xe9, 0x5b, 0x74, 0xc6, 0xd, 0xbf, 0x86, 0x34, 0xff, 0x4d, 0x8d, 0x3f, 0xf4, 0x46, 0x7f, 0xcd, 0x6, 0xb4, 0x58, 0xea, 0x21, 0x93, 0xaa, 0x18, 0xd3, 0x61, 0xa1, 0x13, 0xd8, 0x6a, 0x53, 0xe1, 0x2a, 0x98, 0xb7, 0x5, 0xce, 0x7c, 0x45, 0xf7, 0x3c, 0x8e, 0x4e, 0xfc, 0x37, 0x85, 0xbc, 0xe, 0xc5, 0x77, 0x2b, 0x99, 0x52, 0xe0, 0xd9, 0x6b, 0xa0, 0x12, 0xd2, 0x60, 0xab, 0x19, 0x20, 0x92, 0x59, 0xeb, 0xc4, 0x76, 0xbd, 0xf, 0x36, 0x84, 0x4f, 0xfd, 0x3d, 0x8f, 0x44, 0xf6, 0xcf, 0x7d, 0xb6, 0x4, 0xe8, 0x5a, 0x91, 0x23, 0x1a, 0xa8, 0x63, 0xd1, 0x11, 0xa3, 0x68, 0xda, 0xe3, 0x51, 0x9a, 0x28, 0x7, 0xb5, 0x7e, 0xcc, 0xf5, 0x47, 0x8c, 0x3e, 0xfe, 0x4c, 0x87, 0x35, 0xc, 0xbe, 0x75, 0xc7, 0xb0, 0x2, 0xc9, 0x7b, 0x42, 0xf0, 0x3b, 0x89, 0x49, 0xfb, 0x30, 0x82, 0xbb, 0x9, 0xc2, 0x70, 0x5f, 0xed, 0x26, 0x94, 0xad, 0x1f, 0xd4, 0x66, 0xa6, 0x14, 0xdf, 0x6d, 0x54, 0xe6, 0x2d, 0x9f, 0x73, 0xc1, 0xa, 0xb8, 0x81, 0x33, 0xf8, 0x4a, 0x8a, 0x38, 0xf3, 0x41, 0x78, 0xca, 0x1, 0xb3, 0x9c, 0x2e, 0xe5, 0x57, 0x6e, 0xdc, 0x17, 0xa5, 0x65, 0xd7, 0x1c, 0xae, 0x97, 0x25, 0xee, 0x5c}, + {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf, 0xff, 0x4c, 0x84, 0x37, 0x9, 0xba, 0x72, 0xc1, 0xe, 0xbd, 0x75, 0xc6, 0xf8, 0x4b, 0x83, 0x30, 0xe3, 0x50, 0x98, 0x2b, 0x15, 0xa6, 0x6e, 0xdd, 0x12, 0xa1, 0x69, 0xda, 0xe4, 0x57, 0x9f, 0x2c, 0x1c, 0xaf, 0x67, 0xd4, 0xea, 0x59, 0x91, 0x22, 0xed, 0x5e, 0x96, 0x25, 0x1b, 0xa8, 0x60, 0xd3, 0xdb, 0x68, 0xa0, 0x13, 0x2d, 0x9e, 0x56, 0xe5, 0x2a, 0x99, 0x51, 0xe2, 0xdc, 0x6f, 0xa7, 0x14, 0x24, 0x97, 0x5f, 0xec, 0xd2, 0x61, 0xa9, 0x1a, 0xd5, 0x66, 0xae, 0x1d, 0x23, 0x90, 0x58, 0xeb, 0x38, 0x8b, 0x43, 0xf0, 0xce, 0x7d, 0xb5, 0x6, 0xc9, 0x7a, 0xb2, 0x1, 0x3f, 0x8c, 0x44, 0xf7, 0xc7, 0x74, 0xbc, 0xf, 0x31, 0x82, 0x4a, 0xf9, 0x36, 0x85, 0x4d, 0xfe, 0xc0, 0x73, 0xbb, 0x8, 0xab, 0x18, 0xd0, 0x63, 0x5d, 0xee, 0x26, 0x95, 0x5a, 0xe9, 0x21, 0x92, 0xac, 0x1f, 0xd7, 0x64, 0x54, 0xe7, 0x2f, 0x9c, 0xa2, 0x11, 0xd9, 0x6a, 0xa5, 0x16, 0xde, 0x6d, 0x53, 0xe0, 0x28, 0x9b, 0x48, 0xfb, 0x33, 0x80, 0xbe, 0xd, 0xc5, 0x76, 0xb9, 0xa, 0xc2, 0x71, 0x4f, 0xfc, 0x34, 0x87, 0xb7, 0x4, 0xcc, 0x7f, 0x41, 0xf2, 0x3a, 0x89, 0x46, 0xf5, 0x3d, 0x8e, 0xb0, 0x3, 0xcb, 0x78, 0x70, 0xc3, 0xb, 0xb8, 0x86, 0x35, 0xfd, 0x4e, 0x81, 0x32, 0xfa, 0x49, 0x77, 0xc4, 0xc, 0xbf, 0x8f, 0x3c, 0xf4, 0x47, 0x79, 0xca, 
0x2, 0xb1, 0x7e, 0xcd, 0x5, 0xb6, 0x88, 0x3b, 0xf3, 0x40, 0x93, 0x20, 0xe8, 0x5b, 0x65, 0xd6, 0x1e, 0xad, 0x62, 0xd1, 0x19, 0xaa, 0x94, 0x27, 0xef, 0x5c, 0x6c, 0xdf, 0x17, 0xa4, 0x9a, 0x29, 0xe1, 0x52, 0x9d, 0x2e, 0xe6, 0x55, 0x6b, 0xd8, 0x10, 0xa3}, + {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2, 0x8f, 0x3b, 0xfa, 0x4e, 0x65, 0xd1, 0x10, 0xa4, 0x46, 0xf2, 0x33, 0x87, 0xac, 0x18, 0xd9, 0x6d, 0x3, 0xb7, 0x76, 0xc2, 0xe9, 0x5d, 0x9c, 0x28, 0xca, 0x7e, 0xbf, 0xb, 0x20, 0x94, 0x55, 0xe1, 0x8c, 0x38, 0xf9, 0x4d, 0x66, 0xd2, 0x13, 0xa7, 0x45, 0xf1, 0x30, 0x84, 0xaf, 0x1b, 0xda, 0x6e, 0x6, 0xb2, 0x73, 0xc7, 0xec, 0x58, 0x99, 0x2d, 0xcf, 0x7b, 0xba, 0xe, 0x25, 0x91, 0x50, 0xe4, 0x89, 0x3d, 0xfc, 0x48, 0x63, 0xd7, 0x16, 0xa2, 0x40, 0xf4, 0x35, 0x81, 0xaa, 0x1e, 0xdf, 0x6b, 0x5, 0xb1, 0x70, 0xc4, 0xef, 0x5b, 0x9a, 0x2e, 0xcc, 0x78, 0xb9, 0xd, 0x26, 0x92, 0x53, 0xe7, 0x8a, 0x3e, 0xff, 0x4b, 0x60, 0xd4, 0x15, 0xa1, 0x43, 0xf7, 0x36, 0x82, 0xa9, 0x1d, 0xdc, 0x68, 0xc, 0xb8, 0x79, 0xcd, 0xe6, 0x52, 0x93, 0x27, 0xc5, 0x71, 0xb0, 0x4, 0x2f, 0x9b, 0x5a, 0xee, 0x83, 0x37, 0xf6, 0x42, 0x69, 0xdd, 0x1c, 0xa8, 0x4a, 0xfe, 0x3f, 0x8b, 0xa0, 0x14, 0xd5, 0x61, 0xf, 0xbb, 0x7a, 0xce, 0xe5, 0x51, 0x90, 0x24, 0xc6, 0x72, 0xb3, 0x7, 0x2c, 0x98, 0x59, 0xed, 0x80, 0x34, 0xf5, 0x41, 0x6a, 0xde, 0x1f, 0xab, 0x49, 0xfd, 0x3c, 0x88, 0xa3, 0x17, 0xd6, 0x62, 0xa, 0xbe, 0x7f, 0xcb, 0xe0, 0x54, 0x95, 0x21, 0xc3, 0x77, 0xb6, 0x2, 0x29, 0x9d, 0x5c, 0xe8, 0x85, 0x31, 0xf0, 0x44, 0x6f, 0xdb, 0x1a, 0xae, 0x4c, 0xf8, 0x39, 0x8d, 0xa6, 0x12, 0xd3, 0x67, 0x9, 0xbd, 0x7c, 0xc8, 0xe3, 0x57, 0x96, 0x22, 0xc0, 0x74, 0xb5, 0x1, 0x2a, 0x9e, 0x5f, 0xeb, 0x86, 0x32, 0xf3, 0x47, 0x6c, 0xd8, 0x19, 0xad, 0x4f, 0xfb, 0x3a, 0x8e, 0xa5, 0x11, 0xd0, 0x64}, + {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed, 0x9f, 0x2a, 0xe8, 0x5d, 0x71, 0xc4, 0x6, 0xb3, 0x5e, 0xeb, 0x29, 0x9c, 0xb0, 0x5, 0xc7, 0x72, 0x23, 0x96, 0x54, 0xe1, 0xcd, 0x78, 0xba, 0xf, 0xe2, 0x57, 0x95, 0x20, 0xc, 0xb9, 0x7b, 0xce, 0xbc, 0x9, 0xcb, 0x7e, 0x52, 0xe7, 0x25, 0x90, 0x7d, 0xc8, 0xa, 0xbf, 0x93, 0x26, 0xe4, 0x51, 0x46, 0xf3, 0x31, 0x84, 0xa8, 0x1d, 0xdf, 0x6a, 0x87, 0x32, 0xf0, 0x45, 0x69, 0xdc, 0x1e, 0xab, 0xd9, 0x6c, 0xae, 0x1b, 0x37, 0x82, 0x40, 0xf5, 0x18, 0xad, 0x6f, 0xda, 0xf6, 0x43, 0x81, 0x34, 0x65, 0xd0, 0x12, 0xa7, 0x8b, 0x3e, 0xfc, 0x49, 0xa4, 0x11, 0xd3, 0x66, 0x4a, 0xff, 0x3d, 0x88, 0xfa, 0x4f, 0x8d, 0x38, 0x14, 0xa1, 0x63, 0xd6, 0x3b, 0x8e, 0x4c, 0xf9, 0xd5, 0x60, 0xa2, 0x17, 0x8c, 0x39, 0xfb, 0x4e, 0x62, 0xd7, 0x15, 0xa0, 0x4d, 0xf8, 0x3a, 0x8f, 0xa3, 0x16, 0xd4, 0x61, 0x13, 0xa6, 0x64, 0xd1, 0xfd, 0x48, 0x8a, 0x3f, 0xd2, 0x67, 0xa5, 0x10, 0x3c, 0x89, 0x4b, 0xfe, 0xaf, 0x1a, 0xd8, 0x6d, 0x41, 0xf4, 0x36, 0x83, 0x6e, 0xdb, 0x19, 0xac, 0x80, 0x35, 0xf7, 0x42, 0x30, 0x85, 0x47, 0xf2, 0xde, 0x6b, 0xa9, 0x1c, 0xf1, 0x44, 0x86, 0x33, 0x1f, 0xaa, 0x68, 0xdd, 0xca, 0x7f, 0xbd, 0x8, 0x24, 0x91, 0x53, 0xe6, 0xb, 0xbe, 0x7c, 0xc9, 0xe5, 0x50, 0x92, 0x27, 0x55, 0xe0, 0x22, 0x97, 0xbb, 0xe, 0xcc, 0x79, 0x94, 0x21, 0xe3, 0x56, 0x7a, 0xcf, 0xd, 0xb8, 0xe9, 0x5c, 0x9e, 0x2b, 0x7, 0xb2, 0x70, 0xc5, 0x28, 0x9d, 0x5f, 0xea, 0xc6, 0x73, 0xb1, 0x4, 0x76, 0xc3, 0x1, 0xb4, 0x98, 0x2d, 0xef, 0x5a, 0xb7, 0x2, 0xc0, 0x75, 0x59, 0xec, 0x2e, 0x9b}, + {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc, 0xaf, 0x19, 0xde, 0x68, 0x4d, 0xfb, 0x3c, 0x8a, 0x76, 0xc0, 0x7, 0xb1, 0x94, 0x22, 0xe5, 0x53, 0x43, 0xf5, 0x32, 0x84, 0xa1, 0x17, 0xd0, 0x66, 0x9a, 0x2c, 
0xeb, 0x5d, 0x78, 0xce, 0x9, 0xbf, 0xec, 0x5a, 0x9d, 0x2b, 0xe, 0xb8, 0x7f, 0xc9, 0x35, 0x83, 0x44, 0xf2, 0xd7, 0x61, 0xa6, 0x10, 0x86, 0x30, 0xf7, 0x41, 0x64, 0xd2, 0x15, 0xa3, 0x5f, 0xe9, 0x2e, 0x98, 0xbd, 0xb, 0xcc, 0x7a, 0x29, 0x9f, 0x58, 0xee, 0xcb, 0x7d, 0xba, 0xc, 0xf0, 0x46, 0x81, 0x37, 0x12, 0xa4, 0x63, 0xd5, 0xc5, 0x73, 0xb4, 0x2, 0x27, 0x91, 0x56, 0xe0, 0x1c, 0xaa, 0x6d, 0xdb, 0xfe, 0x48, 0x8f, 0x39, 0x6a, 0xdc, 0x1b, 0xad, 0x88, 0x3e, 0xf9, 0x4f, 0xb3, 0x5, 0xc2, 0x74, 0x51, 0xe7, 0x20, 0x96, 0x11, 0xa7, 0x60, 0xd6, 0xf3, 0x45, 0x82, 0x34, 0xc8, 0x7e, 0xb9, 0xf, 0x2a, 0x9c, 0x5b, 0xed, 0xbe, 0x8, 0xcf, 0x79, 0x5c, 0xea, 0x2d, 0x9b, 0x67, 0xd1, 0x16, 0xa0, 0x85, 0x33, 0xf4, 0x42, 0x52, 0xe4, 0x23, 0x95, 0xb0, 0x6, 0xc1, 0x77, 0x8b, 0x3d, 0xfa, 0x4c, 0x69, 0xdf, 0x18, 0xae, 0xfd, 0x4b, 0x8c, 0x3a, 0x1f, 0xa9, 0x6e, 0xd8, 0x24, 0x92, 0x55, 0xe3, 0xc6, 0x70, 0xb7, 0x1, 0x97, 0x21, 0xe6, 0x50, 0x75, 0xc3, 0x4, 0xb2, 0x4e, 0xf8, 0x3f, 0x89, 0xac, 0x1a, 0xdd, 0x6b, 0x38, 0x8e, 0x49, 0xff, 0xda, 0x6c, 0xab, 0x1d, 0xe1, 0x57, 0x90, 0x26, 0x3, 0xb5, 0x72, 0xc4, 0xd4, 0x62, 0xa5, 0x13, 0x36, 0x80, 0x47, 0xf1, 0xd, 0xbb, 0x7c, 0xca, 0xef, 0x59, 0x9e, 0x28, 0x7b, 0xcd, 0xa, 0xbc, 0x99, 0x2f, 0xe8, 0x5e, 0xa2, 0x14, 0xd3, 0x65, 0x40, 0xf6, 0x31, 0x87}, + {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3, 0xbf, 0x8, 0xcc, 0x7b, 0x59, 0xee, 0x2a, 0x9d, 0x6e, 0xd9, 0x1d, 0xaa, 0x88, 0x3f, 0xfb, 0x4c, 0x63, 0xd4, 0x10, 0xa7, 0x85, 0x32, 0xf6, 0x41, 0xb2, 0x5, 0xc1, 0x76, 0x54, 0xe3, 0x27, 0x90, 0xdc, 0x6b, 0xaf, 0x18, 0x3a, 0x8d, 0x49, 0xfe, 0xd, 0xba, 0x7e, 0xc9, 0xeb, 0x5c, 0x98, 0x2f, 0xc6, 0x71, 0xb5, 0x2, 0x20, 0x97, 0x53, 0xe4, 0x17, 0xa0, 0x64, 0xd3, 0xf1, 0x46, 0x82, 0x35, 0x79, 0xce, 0xa, 0xbd, 0x9f, 0x28, 0xec, 0x5b, 0xa8, 0x1f, 0xdb, 0x6c, 0x4e, 0xf9, 0x3d, 0x8a, 0xa5, 0x12, 0xd6, 0x61, 0x43, 0xf4, 0x30, 0x87, 0x74, 0xc3, 0x7, 0xb0, 0x92, 0x25, 0xe1, 0x56, 0x1a, 0xad, 0x69, 0xde, 0xfc, 0x4b, 0x8f, 0x38, 0xcb, 0x7c, 0xb8, 0xf, 0x2d, 0x9a, 0x5e, 0xe9, 0x91, 0x26, 0xe2, 0x55, 0x77, 0xc0, 0x4, 0xb3, 0x40, 0xf7, 0x33, 0x84, 0xa6, 0x11, 0xd5, 0x62, 0x2e, 0x99, 0x5d, 0xea, 0xc8, 0x7f, 0xbb, 0xc, 0xff, 0x48, 0x8c, 0x3b, 0x19, 0xae, 0x6a, 0xdd, 0xf2, 0x45, 0x81, 0x36, 0x14, 0xa3, 0x67, 0xd0, 0x23, 0x94, 0x50, 0xe7, 0xc5, 0x72, 0xb6, 0x1, 0x4d, 0xfa, 0x3e, 0x89, 0xab, 0x1c, 0xd8, 0x6f, 0x9c, 0x2b, 0xef, 0x58, 0x7a, 0xcd, 0x9, 0xbe, 0x57, 0xe0, 0x24, 0x93, 0xb1, 0x6, 0xc2, 0x75, 0x86, 0x31, 0xf5, 0x42, 0x60, 0xd7, 0x13, 0xa4, 0xe8, 0x5f, 0x9b, 0x2c, 0xe, 0xb9, 0x7d, 0xca, 0x39, 0x8e, 0x4a, 0xfd, 0xdf, 0x68, 0xac, 0x1b, 0x34, 0x83, 0x47, 0xf0, 0xd2, 0x65, 0xa1, 0x16, 0xe5, 0x52, 0x96, 0x21, 0x3, 0xb4, 0x70, 0xc7, 0x8b, 0x3c, 0xf8, 0x4f, 0x6d, 0xda, 0x1e, 0xa9, 0x5a, 0xed, 0x29, 0x9e, 0xbc, 0xb, 0xcf, 0x78}, + {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6, 0x4f, 0xf7, 0x22, 0x9a, 0x95, 0x2d, 0xf8, 0x40, 0xe6, 0x5e, 0x8b, 0x33, 0x3c, 0x84, 0x51, 0xe9, 0x9e, 0x26, 0xf3, 0x4b, 0x44, 0xfc, 0x29, 0x91, 0x37, 0x8f, 0x5a, 0xe2, 0xed, 0x55, 0x80, 0x38, 0xd1, 0x69, 0xbc, 0x4, 0xb, 0xb3, 0x66, 0xde, 0x78, 0xc0, 0x15, 0xad, 0xa2, 0x1a, 0xcf, 0x77, 0x21, 0x99, 0x4c, 0xf4, 0xfb, 0x43, 0x96, 0x2e, 0x88, 0x30, 0xe5, 0x5d, 0x52, 0xea, 0x3f, 0x87, 0x6e, 0xd6, 0x3, 0xbb, 0xb4, 0xc, 0xd9, 0x61, 0xc7, 0x7f, 0xaa, 0x12, 0x1d, 0xa5, 0x70, 0xc8, 0xbf, 0x7, 0xd2, 0x6a, 0x65, 0xdd, 0x8, 0xb0, 0x16, 0xae, 0x7b, 0xc3, 0xcc, 0x74, 0xa1, 0x19, 0xf0, 0x48, 0x9d, 0x25, 0x2a, 0x92, 0x47, 0xff, 0x59, 0xe1, 0x34, 0x8c, 0x83, 0x3b, 0xee, 
0x56, 0x42, 0xfa, 0x2f, 0x97, 0x98, 0x20, 0xf5, 0x4d, 0xeb, 0x53, 0x86, 0x3e, 0x31, 0x89, 0x5c, 0xe4, 0xd, 0xb5, 0x60, 0xd8, 0xd7, 0x6f, 0xba, 0x2, 0xa4, 0x1c, 0xc9, 0x71, 0x7e, 0xc6, 0x13, 0xab, 0xdc, 0x64, 0xb1, 0x9, 0x6, 0xbe, 0x6b, 0xd3, 0x75, 0xcd, 0x18, 0xa0, 0xaf, 0x17, 0xc2, 0x7a, 0x93, 0x2b, 0xfe, 0x46, 0x49, 0xf1, 0x24, 0x9c, 0x3a, 0x82, 0x57, 0xef, 0xe0, 0x58, 0x8d, 0x35, 0x63, 0xdb, 0xe, 0xb6, 0xb9, 0x1, 0xd4, 0x6c, 0xca, 0x72, 0xa7, 0x1f, 0x10, 0xa8, 0x7d, 0xc5, 0x2c, 0x94, 0x41, 0xf9, 0xf6, 0x4e, 0x9b, 0x23, 0x85, 0x3d, 0xe8, 0x50, 0x5f, 0xe7, 0x32, 0x8a, 0xfd, 0x45, 0x90, 0x28, 0x27, 0x9f, 0x4a, 0xf2, 0x54, 0xec, 0x39, 0x81, 0x8e, 0x36, 0xe3, 0x5b, 0xb2, 0xa, 0xdf, 0x67, 0x68, 0xd0, 0x5, 0xbd, 0x1b, 0xa3, 0x76, 0xce, 0xc1, 0x79, 0xac, 0x14}, + {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9, 0x5f, 0xe6, 0x30, 0x89, 0x81, 0x38, 0xee, 0x57, 0xfe, 0x47, 0x91, 0x28, 0x20, 0x99, 0x4f, 0xf6, 0xbe, 0x7, 0xd1, 0x68, 0x60, 0xd9, 0xf, 0xb6, 0x1f, 0xa6, 0x70, 0xc9, 0xc1, 0x78, 0xae, 0x17, 0xe1, 0x58, 0x8e, 0x37, 0x3f, 0x86, 0x50, 0xe9, 0x40, 0xf9, 0x2f, 0x96, 0x9e, 0x27, 0xf1, 0x48, 0x61, 0xd8, 0xe, 0xb7, 0xbf, 0x6, 0xd0, 0x69, 0xc0, 0x79, 0xaf, 0x16, 0x1e, 0xa7, 0x71, 0xc8, 0x3e, 0x87, 0x51, 0xe8, 0xe0, 0x59, 0x8f, 0x36, 0x9f, 0x26, 0xf0, 0x49, 0x41, 0xf8, 0x2e, 0x97, 0xdf, 0x66, 0xb0, 0x9, 0x1, 0xb8, 0x6e, 0xd7, 0x7e, 0xc7, 0x11, 0xa8, 0xa0, 0x19, 0xcf, 0x76, 0x80, 0x39, 0xef, 0x56, 0x5e, 0xe7, 0x31, 0x88, 0x21, 0x98, 0x4e, 0xf7, 0xff, 0x46, 0x90, 0x29, 0xc2, 0x7b, 0xad, 0x14, 0x1c, 0xa5, 0x73, 0xca, 0x63, 0xda, 0xc, 0xb5, 0xbd, 0x4, 0xd2, 0x6b, 0x9d, 0x24, 0xf2, 0x4b, 0x43, 0xfa, 0x2c, 0x95, 0x3c, 0x85, 0x53, 0xea, 0xe2, 0x5b, 0x8d, 0x34, 0x7c, 0xc5, 0x13, 0xaa, 0xa2, 0x1b, 0xcd, 0x74, 0xdd, 0x64, 0xb2, 0xb, 0x3, 0xba, 0x6c, 0xd5, 0x23, 0x9a, 0x4c, 0xf5, 0xfd, 0x44, 0x92, 0x2b, 0x82, 0x3b, 0xed, 0x54, 0x5c, 0xe5, 0x33, 0x8a, 0xa3, 0x1a, 0xcc, 0x75, 0x7d, 0xc4, 0x12, 0xab, 0x2, 0xbb, 0x6d, 0xd4, 0xdc, 0x65, 0xb3, 0xa, 0xfc, 0x45, 0x93, 0x2a, 0x22, 0x9b, 0x4d, 0xf4, 0x5d, 0xe4, 0x32, 0x8b, 0x83, 0x3a, 0xec, 0x55, 0x1d, 0xa4, 0x72, 0xcb, 0xc3, 0x7a, 0xac, 0x15, 0xbc, 0x5, 0xd3, 0x6a, 0x62, 0xdb, 0xd, 0xb4, 0x42, 0xfb, 0x2d, 0x94, 0x9c, 0x25, 0xf3, 0x4a, 0xe3, 0x5a, 0x8c, 0x35, 0x3d, 0x84, 0x52, 0xeb}, + {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8, 0x6f, 0xd5, 0x6, 0xbc, 0xbd, 0x7, 0xd4, 0x6e, 0xd6, 0x6c, 0xbf, 0x5, 0x4, 0xbe, 0x6d, 0xd7, 0xde, 0x64, 0xb7, 0xd, 0xc, 0xb6, 0x65, 0xdf, 0x67, 0xdd, 0xe, 0xb4, 0xb5, 0xf, 0xdc, 0x66, 0xb1, 0xb, 0xd8, 0x62, 0x63, 0xd9, 0xa, 0xb0, 0x8, 0xb2, 0x61, 0xdb, 0xda, 0x60, 0xb3, 0x9, 0xa1, 0x1b, 0xc8, 0x72, 0x73, 0xc9, 0x1a, 0xa0, 0x18, 0xa2, 0x71, 0xcb, 0xca, 0x70, 0xa3, 0x19, 0xce, 0x74, 0xa7, 0x1d, 0x1c, 0xa6, 0x75, 0xcf, 0x77, 0xcd, 0x1e, 0xa4, 0xa5, 0x1f, 0xcc, 0x76, 0x7f, 0xc5, 0x16, 0xac, 0xad, 0x17, 0xc4, 0x7e, 0xc6, 0x7c, 0xaf, 0x15, 0x14, 0xae, 0x7d, 0xc7, 0x10, 0xaa, 0x79, 0xc3, 0xc2, 0x78, 0xab, 0x11, 0xa9, 0x13, 0xc0, 0x7a, 0x7b, 0xc1, 0x12, 0xa8, 0x5f, 0xe5, 0x36, 0x8c, 0x8d, 0x37, 0xe4, 0x5e, 0xe6, 0x5c, 0x8f, 0x35, 0x34, 0x8e, 0x5d, 0xe7, 0x30, 0x8a, 0x59, 0xe3, 0xe2, 0x58, 0x8b, 0x31, 0x89, 0x33, 0xe0, 0x5a, 0x5b, 0xe1, 0x32, 0x88, 0x81, 0x3b, 0xe8, 0x52, 0x53, 0xe9, 0x3a, 0x80, 0x38, 0x82, 0x51, 0xeb, 0xea, 0x50, 0x83, 0x39, 0xee, 0x54, 0x87, 0x3d, 0x3c, 0x86, 0x55, 0xef, 0x57, 0xed, 0x3e, 0x84, 0x85, 0x3f, 0xec, 0x56, 0xfe, 0x44, 0x97, 0x2d, 0x2c, 0x96, 0x45, 0xff, 0x47, 0xfd, 0x2e, 0x94, 0x95, 0x2f, 0xfc, 0x46, 0x91, 0x2b, 0xf8, 0x42, 
0x43, 0xf9, 0x2a, 0x90, 0x28, 0x92, 0x41, 0xfb, 0xfa, 0x40, 0x93, 0x29, 0x20, 0x9a, 0x49, 0xf3, 0xf2, 0x48, 0x9b, 0x21, 0x99, 0x23, 0xf0, 0x4a, 0x4b, 0xf1, 0x22, 0x98, 0x4f, 0xf5, 0x26, 0x9c, 0x9d, 0x27, 0xf4, 0x4e, 0xf6, 0x4c, 0x9f, 0x25, 0x24, 0x9e, 0x4d, 0xf7}, + {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7, 0x7f, 0xc4, 0x14, 0xaf, 0xa9, 0x12, 0xc2, 0x79, 0xce, 0x75, 0xa5, 0x1e, 0x18, 0xa3, 0x73, 0xc8, 0xfe, 0x45, 0x95, 0x2e, 0x28, 0x93, 0x43, 0xf8, 0x4f, 0xf4, 0x24, 0x9f, 0x99, 0x22, 0xf2, 0x49, 0x81, 0x3a, 0xea, 0x51, 0x57, 0xec, 0x3c, 0x87, 0x30, 0x8b, 0x5b, 0xe0, 0xe6, 0x5d, 0x8d, 0x36, 0xe1, 0x5a, 0x8a, 0x31, 0x37, 0x8c, 0x5c, 0xe7, 0x50, 0xeb, 0x3b, 0x80, 0x86, 0x3d, 0xed, 0x56, 0x9e, 0x25, 0xf5, 0x4e, 0x48, 0xf3, 0x23, 0x98, 0x2f, 0x94, 0x44, 0xff, 0xf9, 0x42, 0x92, 0x29, 0x1f, 0xa4, 0x74, 0xcf, 0xc9, 0x72, 0xa2, 0x19, 0xae, 0x15, 0xc5, 0x7e, 0x78, 0xc3, 0x13, 0xa8, 0x60, 0xdb, 0xb, 0xb0, 0xb6, 0xd, 0xdd, 0x66, 0xd1, 0x6a, 0xba, 0x1, 0x7, 0xbc, 0x6c, 0xd7, 0xdf, 0x64, 0xb4, 0xf, 0x9, 0xb2, 0x62, 0xd9, 0x6e, 0xd5, 0x5, 0xbe, 0xb8, 0x3, 0xd3, 0x68, 0xa0, 0x1b, 0xcb, 0x70, 0x76, 0xcd, 0x1d, 0xa6, 0x11, 0xaa, 0x7a, 0xc1, 0xc7, 0x7c, 0xac, 0x17, 0x21, 0x9a, 0x4a, 0xf1, 0xf7, 0x4c, 0x9c, 0x27, 0x90, 0x2b, 0xfb, 0x40, 0x46, 0xfd, 0x2d, 0x96, 0x5e, 0xe5, 0x35, 0x8e, 0x88, 0x33, 0xe3, 0x58, 0xef, 0x54, 0x84, 0x3f, 0x39, 0x82, 0x52, 0xe9, 0x3e, 0x85, 0x55, 0xee, 0xe8, 0x53, 0x83, 0x38, 0x8f, 0x34, 0xe4, 0x5f, 0x59, 0xe2, 0x32, 0x89, 0x41, 0xfa, 0x2a, 0x91, 0x97, 0x2c, 0xfc, 0x47, 0xf0, 0x4b, 0x9b, 0x20, 0x26, 0x9d, 0x4d, 0xf6, 0xc0, 0x7b, 0xab, 0x10, 0x16, 0xad, 0x7d, 0xc6, 0x71, 0xca, 0x1a, 0xa1, 0xa7, 0x1c, 0xcc, 0x77, 0xbf, 0x4, 0xd4, 0x6f, 0x69, 0xd2, 0x2, 0xb9, 0xe, 0xb5, 0x65, 0xde, 0xd8, 0x63, 0xb3, 0x8}, + {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a, 0xf, 0xb3, 0x6a, 0xd6, 0xc5, 0x79, 0xa0, 0x1c, 0x86, 0x3a, 0xe3, 0x5f, 0x4c, 0xf0, 0x29, 0x95, 0x1e, 0xa2, 0x7b, 0xc7, 0xd4, 0x68, 0xb1, 0xd, 0x97, 0x2b, 0xf2, 0x4e, 0x5d, 0xe1, 0x38, 0x84, 0x11, 0xad, 0x74, 0xc8, 0xdb, 0x67, 0xbe, 0x2, 0x98, 0x24, 0xfd, 0x41, 0x52, 0xee, 0x37, 0x8b, 0x3c, 0x80, 0x59, 0xe5, 0xf6, 0x4a, 0x93, 0x2f, 0xb5, 0x9, 0xd0, 0x6c, 0x7f, 0xc3, 0x1a, 0xa6, 0x33, 0x8f, 0x56, 0xea, 0xf9, 0x45, 0x9c, 0x20, 0xba, 0x6, 0xdf, 0x63, 0x70, 0xcc, 0x15, 0xa9, 0x22, 0x9e, 0x47, 0xfb, 0xe8, 0x54, 0x8d, 0x31, 0xab, 0x17, 0xce, 0x72, 0x61, 0xdd, 0x4, 0xb8, 0x2d, 0x91, 0x48, 0xf4, 0xe7, 0x5b, 0x82, 0x3e, 0xa4, 0x18, 0xc1, 0x7d, 0x6e, 0xd2, 0xb, 0xb7, 0x78, 0xc4, 0x1d, 0xa1, 0xb2, 0xe, 0xd7, 0x6b, 0xf1, 0x4d, 0x94, 0x28, 0x3b, 0x87, 0x5e, 0xe2, 0x77, 0xcb, 0x12, 0xae, 0xbd, 0x1, 0xd8, 0x64, 0xfe, 0x42, 0x9b, 0x27, 0x34, 0x88, 0x51, 0xed, 0x66, 0xda, 0x3, 0xbf, 0xac, 0x10, 0xc9, 0x75, 0xef, 0x53, 0x8a, 0x36, 0x25, 0x99, 0x40, 0xfc, 0x69, 0xd5, 0xc, 0xb0, 0xa3, 0x1f, 0xc6, 0x7a, 0xe0, 0x5c, 0x85, 0x39, 0x2a, 0x96, 0x4f, 0xf3, 0x44, 0xf8, 0x21, 0x9d, 0x8e, 0x32, 0xeb, 0x57, 0xcd, 0x71, 0xa8, 0x14, 0x7, 0xbb, 0x62, 0xde, 0x4b, 0xf7, 0x2e, 0x92, 0x81, 0x3d, 0xe4, 0x58, 0xc2, 0x7e, 0xa7, 0x1b, 0x8, 0xb4, 0x6d, 0xd1, 0x5a, 0xe6, 0x3f, 0x83, 0x90, 0x2c, 0xf5, 0x49, 0xd3, 0x6f, 0xb6, 0xa, 0x19, 0xa5, 0x7c, 0xc0, 0x55, 0xe9, 0x30, 0x8c, 0x9f, 0x23, 0xfa, 0x46, 0xdc, 0x60, 0xb9, 0x5, 0x16, 0xaa, 0x73, 0xcf}, + {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95, 0x1f, 0xa2, 0x78, 0xc5, 0xd1, 0x6c, 0xb6, 0xb, 0x9e, 0x23, 0xf9, 0x44, 0x50, 0xed, 0x37, 0x8a, 0x3e, 0x83, 0x59, 0xe4, 0xf0, 0x4d, 0x97, 0x2a, 
0xbf, 0x2, 0xd8, 0x65, 0x71, 0xcc, 0x16, 0xab, 0x21, 0x9c, 0x46, 0xfb, 0xef, 0x52, 0x88, 0x35, 0xa0, 0x1d, 0xc7, 0x7a, 0x6e, 0xd3, 0x9, 0xb4, 0x7c, 0xc1, 0x1b, 0xa6, 0xb2, 0xf, 0xd5, 0x68, 0xfd, 0x40, 0x9a, 0x27, 0x33, 0x8e, 0x54, 0xe9, 0x63, 0xde, 0x4, 0xb9, 0xad, 0x10, 0xca, 0x77, 0xe2, 0x5f, 0x85, 0x38, 0x2c, 0x91, 0x4b, 0xf6, 0x42, 0xff, 0x25, 0x98, 0x8c, 0x31, 0xeb, 0x56, 0xc3, 0x7e, 0xa4, 0x19, 0xd, 0xb0, 0x6a, 0xd7, 0x5d, 0xe0, 0x3a, 0x87, 0x93, 0x2e, 0xf4, 0x49, 0xdc, 0x61, 0xbb, 0x6, 0x12, 0xaf, 0x75, 0xc8, 0xf8, 0x45, 0x9f, 0x22, 0x36, 0x8b, 0x51, 0xec, 0x79, 0xc4, 0x1e, 0xa3, 0xb7, 0xa, 0xd0, 0x6d, 0xe7, 0x5a, 0x80, 0x3d, 0x29, 0x94, 0x4e, 0xf3, 0x66, 0xdb, 0x1, 0xbc, 0xa8, 0x15, 0xcf, 0x72, 0xc6, 0x7b, 0xa1, 0x1c, 0x8, 0xb5, 0x6f, 0xd2, 0x47, 0xfa, 0x20, 0x9d, 0x89, 0x34, 0xee, 0x53, 0xd9, 0x64, 0xbe, 0x3, 0x17, 0xaa, 0x70, 0xcd, 0x58, 0xe5, 0x3f, 0x82, 0x96, 0x2b, 0xf1, 0x4c, 0x84, 0x39, 0xe3, 0x5e, 0x4a, 0xf7, 0x2d, 0x90, 0x5, 0xb8, 0x62, 0xdf, 0xcb, 0x76, 0xac, 0x11, 0x9b, 0x26, 0xfc, 0x41, 0x55, 0xe8, 0x32, 0x8f, 0x1a, 0xa7, 0x7d, 0xc0, 0xd4, 0x69, 0xb3, 0xe, 0xba, 0x7, 0xdd, 0x60, 0x74, 0xc9, 0x13, 0xae, 0x3b, 0x86, 0x5c, 0xe1, 0xf5, 0x48, 0x92, 0x2f, 0xa5, 0x18, 0xc2, 0x7f, 0x6b, 0xd6, 0xc, 0xb1, 0x24, 0x99, 0x43, 0xfe, 0xea, 0x57, 0x8d, 0x30}, + {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84, 0x2f, 0x91, 0x4e, 0xf0, 0xed, 0x53, 0x8c, 0x32, 0xb6, 0x8, 0xd7, 0x69, 0x74, 0xca, 0x15, 0xab, 0x5e, 0xe0, 0x3f, 0x81, 0x9c, 0x22, 0xfd, 0x43, 0xc7, 0x79, 0xa6, 0x18, 0x5, 0xbb, 0x64, 0xda, 0x71, 0xcf, 0x10, 0xae, 0xb3, 0xd, 0xd2, 0x6c, 0xe8, 0x56, 0x89, 0x37, 0x2a, 0x94, 0x4b, 0xf5, 0xbc, 0x2, 0xdd, 0x63, 0x7e, 0xc0, 0x1f, 0xa1, 0x25, 0x9b, 0x44, 0xfa, 0xe7, 0x59, 0x86, 0x38, 0x93, 0x2d, 0xf2, 0x4c, 0x51, 0xef, 0x30, 0x8e, 0xa, 0xb4, 0x6b, 0xd5, 0xc8, 0x76, 0xa9, 0x17, 0xe2, 0x5c, 0x83, 0x3d, 0x20, 0x9e, 0x41, 0xff, 0x7b, 0xc5, 0x1a, 0xa4, 0xb9, 0x7, 0xd8, 0x66, 0xcd, 0x73, 0xac, 0x12, 0xf, 0xb1, 0x6e, 0xd0, 0x54, 0xea, 0x35, 0x8b, 0x96, 0x28, 0xf7, 0x49, 0x65, 0xdb, 0x4, 0xba, 0xa7, 0x19, 0xc6, 0x78, 0xfc, 0x42, 0x9d, 0x23, 0x3e, 0x80, 0x5f, 0xe1, 0x4a, 0xf4, 0x2b, 0x95, 0x88, 0x36, 0xe9, 0x57, 0xd3, 0x6d, 0xb2, 0xc, 0x11, 0xaf, 0x70, 0xce, 0x3b, 0x85, 0x5a, 0xe4, 0xf9, 0x47, 0x98, 0x26, 0xa2, 0x1c, 0xc3, 0x7d, 0x60, 0xde, 0x1, 0xbf, 0x14, 0xaa, 0x75, 0xcb, 0xd6, 0x68, 0xb7, 0x9, 0x8d, 0x33, 0xec, 0x52, 0x4f, 0xf1, 0x2e, 0x90, 0xd9, 0x67, 0xb8, 0x6, 0x1b, 0xa5, 0x7a, 0xc4, 0x40, 0xfe, 0x21, 0x9f, 0x82, 0x3c, 0xe3, 0x5d, 0xf6, 0x48, 0x97, 0x29, 0x34, 0x8a, 0x55, 0xeb, 0x6f, 0xd1, 0xe, 0xb0, 0xad, 0x13, 0xcc, 0x72, 0x87, 0x39, 0xe6, 0x58, 0x45, 0xfb, 0x24, 0x9a, 0x1e, 0xa0, 0x7f, 0xc1, 0xdc, 0x62, 0xbd, 0x3, 0xa8, 0x16, 0xc9, 0x77, 0x6a, 0xd4, 0xb, 0xb5, 0x31, 0x8f, 0x50, 0xee, 0xf3, 0x4d, 0x92, 0x2c}, + {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b, 0x3f, 0x80, 0x5c, 0xe3, 0xf9, 0x46, 0x9a, 0x25, 0xae, 0x11, 0xcd, 0x72, 0x68, 0xd7, 0xb, 0xb4, 0x7e, 0xc1, 0x1d, 0xa2, 0xb8, 0x7, 0xdb, 0x64, 0xef, 0x50, 0x8c, 0x33, 0x29, 0x96, 0x4a, 0xf5, 0x41, 0xfe, 0x22, 0x9d, 0x87, 0x38, 0xe4, 0x5b, 0xd0, 0x6f, 0xb3, 0xc, 0x16, 0xa9, 0x75, 0xca, 0xfc, 0x43, 0x9f, 0x20, 0x3a, 0x85, 0x59, 0xe6, 0x6d, 0xd2, 0xe, 0xb1, 0xab, 0x14, 0xc8, 0x77, 0xc3, 0x7c, 0xa0, 0x1f, 0x5, 0xba, 0x66, 0xd9, 0x52, 0xed, 0x31, 0x8e, 0x94, 0x2b, 0xf7, 0x48, 0x82, 0x3d, 0xe1, 0x5e, 0x44, 0xfb, 0x27, 0x98, 0x13, 0xac, 0x70, 0xcf, 0xd5, 0x6a, 0xb6, 0x9, 0xbd, 0x2, 0xde, 0x61, 0x7b, 0xc4, 0x18, 0xa7, 0x2c, 0x93, 0x4f, 0xf0, 0xea, 
0x55, 0x89, 0x36, 0xe5, 0x5a, 0x86, 0x39, 0x23, 0x9c, 0x40, 0xff, 0x74, 0xcb, 0x17, 0xa8, 0xb2, 0xd, 0xd1, 0x6e, 0xda, 0x65, 0xb9, 0x6, 0x1c, 0xa3, 0x7f, 0xc0, 0x4b, 0xf4, 0x28, 0x97, 0x8d, 0x32, 0xee, 0x51, 0x9b, 0x24, 0xf8, 0x47, 0x5d, 0xe2, 0x3e, 0x81, 0xa, 0xb5, 0x69, 0xd6, 0xcc, 0x73, 0xaf, 0x10, 0xa4, 0x1b, 0xc7, 0x78, 0x62, 0xdd, 0x1, 0xbe, 0x35, 0x8a, 0x56, 0xe9, 0xf3, 0x4c, 0x90, 0x2f, 0x19, 0xa6, 0x7a, 0xc5, 0xdf, 0x60, 0xbc, 0x3, 0x88, 0x37, 0xeb, 0x54, 0x4e, 0xf1, 0x2d, 0x92, 0x26, 0x99, 0x45, 0xfa, 0xe0, 0x5f, 0x83, 0x3c, 0xb7, 0x8, 0xd4, 0x6b, 0x71, 0xce, 0x12, 0xad, 0x67, 0xd8, 0x4, 0xbb, 0xa1, 0x1e, 0xc2, 0x7d, 0xf6, 0x49, 0x95, 0x2a, 0x30, 0x8f, 0x53, 0xec, 0x58, 0xe7, 0x3b, 0x84, 0x9e, 0x21, 0xfd, 0x42, 0xc9, 0x76, 0xaa, 0x15, 0xf, 0xb0, 0x6c, 0xd3}, + {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34, 0x9c, 0x5c, 0x1, 0xc1, 0xbb, 0x7b, 0x26, 0xe6, 0xd2, 0x12, 0x4f, 0x8f, 0xf5, 0x35, 0x68, 0xa8, 0x25, 0xe5, 0xb8, 0x78, 0x2, 0xc2, 0x9f, 0x5f, 0x6b, 0xab, 0xf6, 0x36, 0x4c, 0x8c, 0xd1, 0x11, 0xb9, 0x79, 0x24, 0xe4, 0x9e, 0x5e, 0x3, 0xc3, 0xf7, 0x37, 0x6a, 0xaa, 0xd0, 0x10, 0x4d, 0x8d, 0x4a, 0x8a, 0xd7, 0x17, 0x6d, 0xad, 0xf0, 0x30, 0x4, 0xc4, 0x99, 0x59, 0x23, 0xe3, 0xbe, 0x7e, 0xd6, 0x16, 0x4b, 0x8b, 0xf1, 0x31, 0x6c, 0xac, 0x98, 0x58, 0x5, 0xc5, 0xbf, 0x7f, 0x22, 0xe2, 0x6f, 0xaf, 0xf2, 0x32, 0x48, 0x88, 0xd5, 0x15, 0x21, 0xe1, 0xbc, 0x7c, 0x6, 0xc6, 0x9b, 0x5b, 0xf3, 0x33, 0x6e, 0xae, 0xd4, 0x14, 0x49, 0x89, 0xbd, 0x7d, 0x20, 0xe0, 0x9a, 0x5a, 0x7, 0xc7, 0x94, 0x54, 0x9, 0xc9, 0xb3, 0x73, 0x2e, 0xee, 0xda, 0x1a, 0x47, 0x87, 0xfd, 0x3d, 0x60, 0xa0, 0x8, 0xc8, 0x95, 0x55, 0x2f, 0xef, 0xb2, 0x72, 0x46, 0x86, 0xdb, 0x1b, 0x61, 0xa1, 0xfc, 0x3c, 0xb1, 0x71, 0x2c, 0xec, 0x96, 0x56, 0xb, 0xcb, 0xff, 0x3f, 0x62, 0xa2, 0xd8, 0x18, 0x45, 0x85, 0x2d, 0xed, 0xb0, 0x70, 0xa, 0xca, 0x97, 0x57, 0x63, 0xa3, 0xfe, 0x3e, 0x44, 0x84, 0xd9, 0x19, 0xde, 0x1e, 0x43, 0x83, 0xf9, 0x39, 0x64, 0xa4, 0x90, 0x50, 0xd, 0xcd, 0xb7, 0x77, 0x2a, 0xea, 0x42, 0x82, 0xdf, 0x1f, 0x65, 0xa5, 0xf8, 0x38, 0xc, 0xcc, 0x91, 0x51, 0x2b, 0xeb, 0xb6, 0x76, 0xfb, 0x3b, 0x66, 0xa6, 0xdc, 0x1c, 0x41, 0x81, 0xb5, 0x75, 0x28, 0xe8, 0x92, 0x52, 0xf, 0xcf, 0x67, 0xa7, 0xfa, 0x3a, 0x40, 0x80, 0xdd, 0x1d, 0x29, 0xe9, 0xb4, 0x74, 0xe, 0xce, 0x93, 0x53}, + {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b, 0x8c, 0x4d, 0x13, 0xd2, 0xaf, 0x6e, 0x30, 0xf1, 0xca, 0xb, 0x55, 0x94, 0xe9, 0x28, 0x76, 0xb7, 0x5, 0xc4, 0x9a, 0x5b, 0x26, 0xe7, 0xb9, 0x78, 0x43, 0x82, 0xdc, 0x1d, 0x60, 0xa1, 0xff, 0x3e, 0x89, 0x48, 0x16, 0xd7, 0xaa, 0x6b, 0x35, 0xf4, 0xcf, 0xe, 0x50, 0x91, 0xec, 0x2d, 0x73, 0xb2, 0xa, 0xcb, 0x95, 0x54, 0x29, 0xe8, 0xb6, 0x77, 0x4c, 0x8d, 0xd3, 0x12, 0x6f, 0xae, 0xf0, 0x31, 0x86, 0x47, 0x19, 0xd8, 0xa5, 0x64, 0x3a, 0xfb, 0xc0, 0x1, 0x5f, 0x9e, 0xe3, 0x22, 0x7c, 0xbd, 0xf, 0xce, 0x90, 0x51, 0x2c, 0xed, 0xb3, 0x72, 0x49, 0x88, 0xd6, 0x17, 0x6a, 0xab, 0xf5, 0x34, 0x83, 0x42, 0x1c, 0xdd, 0xa0, 0x61, 0x3f, 0xfe, 0xc5, 0x4, 0x5a, 0x9b, 0xe6, 0x27, 0x79, 0xb8, 0x14, 0xd5, 0x8b, 0x4a, 0x37, 0xf6, 0xa8, 0x69, 0x52, 0x93, 0xcd, 0xc, 0x71, 0xb0, 0xee, 0x2f, 0x98, 0x59, 0x7, 0xc6, 0xbb, 0x7a, 0x24, 0xe5, 0xde, 0x1f, 0x41, 0x80, 0xfd, 0x3c, 0x62, 0xa3, 0x11, 0xd0, 0x8e, 0x4f, 0x32, 0xf3, 0xad, 0x6c, 0x57, 0x96, 0xc8, 0x9, 0x74, 0xb5, 0xeb, 0x2a, 0x9d, 0x5c, 0x2, 0xc3, 0xbe, 0x7f, 0x21, 0xe0, 0xdb, 0x1a, 0x44, 0x85, 0xf8, 0x39, 0x67, 0xa6, 0x1e, 0xdf, 0x81, 0x40, 0x3d, 0xfc, 0xa2, 0x63, 0x58, 0x99, 0xc7, 0x6, 0x7b, 0xba, 0xe4, 0x25, 0x92, 0x53, 
0xd, 0xcc, 0xb1, 0x70, 0x2e, 0xef, 0xd4, 0x15, 0x4b, 0x8a, 0xf7, 0x36, 0x68, 0xa9, 0x1b, 0xda, 0x84, 0x45, 0x38, 0xf9, 0xa7, 0x66, 0x5d, 0x9c, 0xc2, 0x3, 0x7e, 0xbf, 0xe1, 0x20, 0x97, 0x56, 0x8, 0xc9, 0xb4, 0x75, 0x2b, 0xea, 0xd1, 0x10, 0x4e, 0x8f, 0xf2, 0x33, 0x6d, 0xac}, + {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a, 0xbc, 0x7e, 0x25, 0xe7, 0x93, 0x51, 0xa, 0xc8, 0xe2, 0x20, 0x7b, 0xb9, 0xcd, 0xf, 0x54, 0x96, 0x65, 0xa7, 0xfc, 0x3e, 0x4a, 0x88, 0xd3, 0x11, 0x3b, 0xf9, 0xa2, 0x60, 0x14, 0xd6, 0x8d, 0x4f, 0xd9, 0x1b, 0x40, 0x82, 0xf6, 0x34, 0x6f, 0xad, 0x87, 0x45, 0x1e, 0xdc, 0xa8, 0x6a, 0x31, 0xf3, 0xca, 0x8, 0x53, 0x91, 0xe5, 0x27, 0x7c, 0xbe, 0x94, 0x56, 0xd, 0xcf, 0xbb, 0x79, 0x22, 0xe0, 0x76, 0xb4, 0xef, 0x2d, 0x59, 0x9b, 0xc0, 0x2, 0x28, 0xea, 0xb1, 0x73, 0x7, 0xc5, 0x9e, 0x5c, 0xaf, 0x6d, 0x36, 0xf4, 0x80, 0x42, 0x19, 0xdb, 0xf1, 0x33, 0x68, 0xaa, 0xde, 0x1c, 0x47, 0x85, 0x13, 0xd1, 0x8a, 0x48, 0x3c, 0xfe, 0xa5, 0x67, 0x4d, 0x8f, 0xd4, 0x16, 0x62, 0xa0, 0xfb, 0x39, 0x89, 0x4b, 0x10, 0xd2, 0xa6, 0x64, 0x3f, 0xfd, 0xd7, 0x15, 0x4e, 0x8c, 0xf8, 0x3a, 0x61, 0xa3, 0x35, 0xf7, 0xac, 0x6e, 0x1a, 0xd8, 0x83, 0x41, 0x6b, 0xa9, 0xf2, 0x30, 0x44, 0x86, 0xdd, 0x1f, 0xec, 0x2e, 0x75, 0xb7, 0xc3, 0x1, 0x5a, 0x98, 0xb2, 0x70, 0x2b, 0xe9, 0x9d, 0x5f, 0x4, 0xc6, 0x50, 0x92, 0xc9, 0xb, 0x7f, 0xbd, 0xe6, 0x24, 0xe, 0xcc, 0x97, 0x55, 0x21, 0xe3, 0xb8, 0x7a, 0x43, 0x81, 0xda, 0x18, 0x6c, 0xae, 0xf5, 0x37, 0x1d, 0xdf, 0x84, 0x46, 0x32, 0xf0, 0xab, 0x69, 0xff, 0x3d, 0x66, 0xa4, 0xd0, 0x12, 0x49, 0x8b, 0xa1, 0x63, 0x38, 0xfa, 0x8e, 0x4c, 0x17, 0xd5, 0x26, 0xe4, 0xbf, 0x7d, 0x9, 0xcb, 0x90, 0x52, 0x78, 0xba, 0xe1, 0x23, 0x57, 0x95, 0xce, 0xc, 0x9a, 0x58, 0x3, 0xc1, 0xb5, 0x77, 0x2c, 0xee, 0xc4, 0x6, 0x5d, 0x9f, 0xeb, 0x29, 0x72, 0xb0}, + {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25, 0xac, 0x6f, 0x37, 0xf4, 0x87, 0x44, 0x1c, 0xdf, 0xfa, 0x39, 0x61, 0xa2, 0xd1, 0x12, 0x4a, 0x89, 0x45, 0x86, 0xde, 0x1d, 0x6e, 0xad, 0xf5, 0x36, 0x13, 0xd0, 0x88, 0x4b, 0x38, 0xfb, 0xa3, 0x60, 0xe9, 0x2a, 0x72, 0xb1, 0xc2, 0x1, 0x59, 0x9a, 0xbf, 0x7c, 0x24, 0xe7, 0x94, 0x57, 0xf, 0xcc, 0x8a, 0x49, 0x11, 0xd2, 0xa1, 0x62, 0x3a, 0xf9, 0xdc, 0x1f, 0x47, 0x84, 0xf7, 0x34, 0x6c, 0xaf, 0x26, 0xe5, 0xbd, 0x7e, 0xd, 0xce, 0x96, 0x55, 0x70, 0xb3, 0xeb, 0x28, 0x5b, 0x98, 0xc0, 0x3, 0xcf, 0xc, 0x54, 0x97, 0xe4, 0x27, 0x7f, 0xbc, 0x99, 0x5a, 0x2, 0xc1, 0xb2, 0x71, 0x29, 0xea, 0x63, 0xa0, 0xf8, 0x3b, 0x48, 0x8b, 0xd3, 0x10, 0x35, 0xf6, 0xae, 0x6d, 0x1e, 0xdd, 0x85, 0x46, 0x9, 0xca, 0x92, 0x51, 0x22, 0xe1, 0xb9, 0x7a, 0x5f, 0x9c, 0xc4, 0x7, 0x74, 0xb7, 0xef, 0x2c, 0xa5, 0x66, 0x3e, 0xfd, 0x8e, 0x4d, 0x15, 0xd6, 0xf3, 0x30, 0x68, 0xab, 0xd8, 0x1b, 0x43, 0x80, 0x4c, 0x8f, 0xd7, 0x14, 0x67, 0xa4, 0xfc, 0x3f, 0x1a, 0xd9, 0x81, 0x42, 0x31, 0xf2, 0xaa, 0x69, 0xe0, 0x23, 0x7b, 0xb8, 0xcb, 0x8, 0x50, 0x93, 0xb6, 0x75, 0x2d, 0xee, 0x9d, 0x5e, 0x6, 0xc5, 0x83, 0x40, 0x18, 0xdb, 0xa8, 0x6b, 0x33, 0xf0, 0xd5, 0x16, 0x4e, 0x8d, 0xfe, 0x3d, 0x65, 0xa6, 0x2f, 0xec, 0xb4, 0x77, 0x4, 0xc7, 0x9f, 0x5c, 0x79, 0xba, 0xe2, 0x21, 0x52, 0x91, 0xc9, 0xa, 0xc6, 0x5, 0x5d, 0x9e, 0xed, 0x2e, 0x76, 0xb5, 0x90, 0x53, 0xb, 0xc8, 0xbb, 0x78, 0x20, 0xe3, 0x6a, 0xa9, 0xf1, 0x32, 0x41, 0x82, 0xda, 0x19, 0x3c, 0xff, 0xa7, 0x64, 0x17, 0xd4, 0x8c, 0x4f}, + {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8, 0xdc, 0x18, 0x49, 0x8d, 0xeb, 0x2f, 0x7e, 0xba, 0xb2, 0x76, 0x27, 0xe3, 0x85, 0x41, 0x10, 0xd4, 0xa5, 0x61, 0x30, 0xf4, 0x92, 0x56, 
0x7, 0xc3, 0xcb, 0xf, 0x5e, 0x9a, 0xfc, 0x38, 0x69, 0xad, 0x79, 0xbd, 0xec, 0x28, 0x4e, 0x8a, 0xdb, 0x1f, 0x17, 0xd3, 0x82, 0x46, 0x20, 0xe4, 0xb5, 0x71, 0x57, 0x93, 0xc2, 0x6, 0x60, 0xa4, 0xf5, 0x31, 0x39, 0xfd, 0xac, 0x68, 0xe, 0xca, 0x9b, 0x5f, 0x8b, 0x4f, 0x1e, 0xda, 0xbc, 0x78, 0x29, 0xed, 0xe5, 0x21, 0x70, 0xb4, 0xd2, 0x16, 0x47, 0x83, 0xf2, 0x36, 0x67, 0xa3, 0xc5, 0x1, 0x50, 0x94, 0x9c, 0x58, 0x9, 0xcd, 0xab, 0x6f, 0x3e, 0xfa, 0x2e, 0xea, 0xbb, 0x7f, 0x19, 0xdd, 0x8c, 0x48, 0x40, 0x84, 0xd5, 0x11, 0x77, 0xb3, 0xe2, 0x26, 0xae, 0x6a, 0x3b, 0xff, 0x99, 0x5d, 0xc, 0xc8, 0xc0, 0x4, 0x55, 0x91, 0xf7, 0x33, 0x62, 0xa6, 0x72, 0xb6, 0xe7, 0x23, 0x45, 0x81, 0xd0, 0x14, 0x1c, 0xd8, 0x89, 0x4d, 0x2b, 0xef, 0xbe, 0x7a, 0xb, 0xcf, 0x9e, 0x5a, 0x3c, 0xf8, 0xa9, 0x6d, 0x65, 0xa1, 0xf0, 0x34, 0x52, 0x96, 0xc7, 0x3, 0xd7, 0x13, 0x42, 0x86, 0xe0, 0x24, 0x75, 0xb1, 0xb9, 0x7d, 0x2c, 0xe8, 0x8e, 0x4a, 0x1b, 0xdf, 0xf9, 0x3d, 0x6c, 0xa8, 0xce, 0xa, 0x5b, 0x9f, 0x97, 0x53, 0x2, 0xc6, 0xa0, 0x64, 0x35, 0xf1, 0x25, 0xe1, 0xb0, 0x74, 0x12, 0xd6, 0x87, 0x43, 0x4b, 0x8f, 0xde, 0x1a, 0x7c, 0xb8, 0xe9, 0x2d, 0x5c, 0x98, 0xc9, 0xd, 0x6b, 0xaf, 0xfe, 0x3a, 0x32, 0xf6, 0xa7, 0x63, 0x5, 0xc1, 0x90, 0x54, 0x80, 0x44, 0x15, 0xd1, 0xb7, 0x73, 0x22, 0xe6, 0xee, 0x2a, 0x7b, 0xbf, 0xd9, 0x1d, 0x4c, 0x88}, + {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7, 0xcc, 0x9, 0x5b, 0x9e, 0xff, 0x3a, 0x68, 0xad, 0xaa, 0x6f, 0x3d, 0xf8, 0x99, 0x5c, 0xe, 0xcb, 0x85, 0x40, 0x12, 0xd7, 0xb6, 0x73, 0x21, 0xe4, 0xe3, 0x26, 0x74, 0xb1, 0xd0, 0x15, 0x47, 0x82, 0x49, 0x8c, 0xde, 0x1b, 0x7a, 0xbf, 0xed, 0x28, 0x2f, 0xea, 0xb8, 0x7d, 0x1c, 0xd9, 0x8b, 0x4e, 0x17, 0xd2, 0x80, 0x45, 0x24, 0xe1, 0xb3, 0x76, 0x71, 0xb4, 0xe6, 0x23, 0x42, 0x87, 0xd5, 0x10, 0xdb, 0x1e, 0x4c, 0x89, 0xe8, 0x2d, 0x7f, 0xba, 0xbd, 0x78, 0x2a, 0xef, 0x8e, 0x4b, 0x19, 0xdc, 0x92, 0x57, 0x5, 0xc0, 0xa1, 0x64, 0x36, 0xf3, 0xf4, 0x31, 0x63, 0xa6, 0xc7, 0x2, 0x50, 0x95, 0x5e, 0x9b, 0xc9, 0xc, 0x6d, 0xa8, 0xfa, 0x3f, 0x38, 0xfd, 0xaf, 0x6a, 0xb, 0xce, 0x9c, 0x59, 0x2e, 0xeb, 0xb9, 0x7c, 0x1d, 0xd8, 0x8a, 0x4f, 0x48, 0x8d, 0xdf, 0x1a, 0x7b, 0xbe, 0xec, 0x29, 0xe2, 0x27, 0x75, 0xb0, 0xd1, 0x14, 0x46, 0x83, 0x84, 0x41, 0x13, 0xd6, 0xb7, 0x72, 0x20, 0xe5, 0xab, 0x6e, 0x3c, 0xf9, 0x98, 0x5d, 0xf, 0xca, 0xcd, 0x8, 0x5a, 0x9f, 0xfe, 0x3b, 0x69, 0xac, 0x67, 0xa2, 0xf0, 0x35, 0x54, 0x91, 0xc3, 0x6, 0x1, 0xc4, 0x96, 0x53, 0x32, 0xf7, 0xa5, 0x60, 0x39, 0xfc, 0xae, 0x6b, 0xa, 0xcf, 0x9d, 0x58, 0x5f, 0x9a, 0xc8, 0xd, 0x6c, 0xa9, 0xfb, 0x3e, 0xf5, 0x30, 0x62, 0xa7, 0xc6, 0x3, 0x51, 0x94, 0x93, 0x56, 0x4, 0xc1, 0xa0, 0x65, 0x37, 0xf2, 0xbc, 0x79, 0x2b, 0xee, 0x8f, 0x4a, 0x18, 0xdd, 0xda, 0x1f, 0x4d, 0x88, 0xe9, 0x2c, 0x7e, 0xbb, 0x70, 0xb5, 0xe7, 0x22, 0x43, 0x86, 0xd4, 0x11, 0x16, 0xd3, 0x81, 0x44, 0x25, 0xe0, 0xb2, 0x77}, + {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16, 0xfc, 0x3a, 0x6d, 0xab, 0xc3, 0x5, 0x52, 0x94, 0x82, 0x44, 0x13, 0xd5, 0xbd, 0x7b, 0x2c, 0xea, 0xe5, 0x23, 0x74, 0xb2, 0xda, 0x1c, 0x4b, 0x8d, 0x9b, 0x5d, 0xa, 0xcc, 0xa4, 0x62, 0x35, 0xf3, 0x19, 0xdf, 0x88, 0x4e, 0x26, 0xe0, 0xb7, 0x71, 0x67, 0xa1, 0xf6, 0x30, 0x58, 0x9e, 0xc9, 0xf, 0xd7, 0x11, 0x46, 0x80, 0xe8, 0x2e, 0x79, 0xbf, 0xa9, 0x6f, 0x38, 0xfe, 0x96, 0x50, 0x7, 0xc1, 0x2b, 0xed, 0xba, 0x7c, 0x14, 0xd2, 0x85, 0x43, 0x55, 0x93, 0xc4, 0x2, 0x6a, 0xac, 0xfb, 0x3d, 0x32, 0xf4, 0xa3, 0x65, 0xd, 0xcb, 0x9c, 0x5a, 0x4c, 0x8a, 0xdd, 0x1b, 0x73, 0xb5, 0xe2, 0x24, 0xce, 0x8, 0x5f, 0x99, 0xf1, 0x37, 0x60, 0xa6, 0xb0, 0x76, 0x21, 
0xe7, 0x8f, 0x49, 0x1e, 0xd8, 0xb3, 0x75, 0x22, 0xe4, 0x8c, 0x4a, 0x1d, 0xdb, 0xcd, 0xb, 0x5c, 0x9a, 0xf2, 0x34, 0x63, 0xa5, 0x4f, 0x89, 0xde, 0x18, 0x70, 0xb6, 0xe1, 0x27, 0x31, 0xf7, 0xa0, 0x66, 0xe, 0xc8, 0x9f, 0x59, 0x56, 0x90, 0xc7, 0x1, 0x69, 0xaf, 0xf8, 0x3e, 0x28, 0xee, 0xb9, 0x7f, 0x17, 0xd1, 0x86, 0x40, 0xaa, 0x6c, 0x3b, 0xfd, 0x95, 0x53, 0x4, 0xc2, 0xd4, 0x12, 0x45, 0x83, 0xeb, 0x2d, 0x7a, 0xbc, 0x64, 0xa2, 0xf5, 0x33, 0x5b, 0x9d, 0xca, 0xc, 0x1a, 0xdc, 0x8b, 0x4d, 0x25, 0xe3, 0xb4, 0x72, 0x98, 0x5e, 0x9, 0xcf, 0xa7, 0x61, 0x36, 0xf0, 0xe6, 0x20, 0x77, 0xb1, 0xd9, 0x1f, 0x48, 0x8e, 0x81, 0x47, 0x10, 0xd6, 0xbe, 0x78, 0x2f, 0xe9, 0xff, 0x39, 0x6e, 0xa8, 0xc0, 0x6, 0x51, 0x97, 0x7d, 0xbb, 0xec, 0x2a, 0x42, 0x84, 0xd3, 0x15, 0x3, 0xc5, 0x92, 0x54, 0x3c, 0xfa, 0xad, 0x6b}, + {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19, 0xec, 0x2b, 0x7f, 0xb8, 0xd7, 0x10, 0x44, 0x83, 0x9a, 0x5d, 0x9, 0xce, 0xa1, 0x66, 0x32, 0xf5, 0xc5, 0x2, 0x56, 0x91, 0xfe, 0x39, 0x6d, 0xaa, 0xb3, 0x74, 0x20, 0xe7, 0x88, 0x4f, 0x1b, 0xdc, 0x29, 0xee, 0xba, 0x7d, 0x12, 0xd5, 0x81, 0x46, 0x5f, 0x98, 0xcc, 0xb, 0x64, 0xa3, 0xf7, 0x30, 0x97, 0x50, 0x4, 0xc3, 0xac, 0x6b, 0x3f, 0xf8, 0xe1, 0x26, 0x72, 0xb5, 0xda, 0x1d, 0x49, 0x8e, 0x7b, 0xbc, 0xe8, 0x2f, 0x40, 0x87, 0xd3, 0x14, 0xd, 0xca, 0x9e, 0x59, 0x36, 0xf1, 0xa5, 0x62, 0x52, 0x95, 0xc1, 0x6, 0x69, 0xae, 0xfa, 0x3d, 0x24, 0xe3, 0xb7, 0x70, 0x1f, 0xd8, 0x8c, 0x4b, 0xbe, 0x79, 0x2d, 0xea, 0x85, 0x42, 0x16, 0xd1, 0xc8, 0xf, 0x5b, 0x9c, 0xf3, 0x34, 0x60, 0xa7, 0x33, 0xf4, 0xa0, 0x67, 0x8, 0xcf, 0x9b, 0x5c, 0x45, 0x82, 0xd6, 0x11, 0x7e, 0xb9, 0xed, 0x2a, 0xdf, 0x18, 0x4c, 0x8b, 0xe4, 0x23, 0x77, 0xb0, 0xa9, 0x6e, 0x3a, 0xfd, 0x92, 0x55, 0x1, 0xc6, 0xf6, 0x31, 0x65, 0xa2, 0xcd, 0xa, 0x5e, 0x99, 0x80, 0x47, 0x13, 0xd4, 0xbb, 0x7c, 0x28, 0xef, 0x1a, 0xdd, 0x89, 0x4e, 0x21, 0xe6, 0xb2, 0x75, 0x6c, 0xab, 0xff, 0x38, 0x57, 0x90, 0xc4, 0x3, 0xa4, 0x63, 0x37, 0xf0, 0x9f, 0x58, 0xc, 0xcb, 0xd2, 0x15, 0x41, 0x86, 0xe9, 0x2e, 0x7a, 0xbd, 0x48, 0x8f, 0xdb, 0x1c, 0x73, 0xb4, 0xe0, 0x27, 0x3e, 0xf9, 0xad, 0x6a, 0x5, 0xc2, 0x96, 0x51, 0x61, 0xa6, 0xf2, 0x35, 0x5a, 0x9d, 0xc9, 0xe, 0x17, 0xd0, 0x84, 0x43, 0x2c, 0xeb, 0xbf, 0x78, 0x8d, 0x4a, 0x1e, 0xd9, 0xb6, 0x71, 0x25, 0xe2, 0xfb, 0x3c, 0x68, 0xaf, 0xc0, 0x7, 0x53, 0x94}, + {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c, 0x1c, 0xd4, 0x91, 0x59, 0x1b, 0xd3, 0x96, 0x5e, 0x12, 0xda, 0x9f, 0x57, 0x15, 0xdd, 0x98, 0x50, 0x38, 0xf0, 0xb5, 0x7d, 0x3f, 0xf7, 0xb2, 0x7a, 0x36, 0xfe, 0xbb, 0x73, 0x31, 0xf9, 0xbc, 0x74, 0x24, 0xec, 0xa9, 0x61, 0x23, 0xeb, 0xae, 0x66, 0x2a, 0xe2, 0xa7, 0x6f, 0x2d, 0xe5, 0xa0, 0x68, 0x70, 0xb8, 0xfd, 0x35, 0x77, 0xbf, 0xfa, 0x32, 0x7e, 0xb6, 0xf3, 0x3b, 0x79, 0xb1, 0xf4, 0x3c, 0x6c, 0xa4, 0xe1, 0x29, 0x6b, 0xa3, 0xe6, 0x2e, 0x62, 0xaa, 0xef, 0x27, 0x65, 0xad, 0xe8, 0x20, 0x48, 0x80, 0xc5, 0xd, 0x4f, 0x87, 0xc2, 0xa, 0x46, 0x8e, 0xcb, 0x3, 0x41, 0x89, 0xcc, 0x4, 0x54, 0x9c, 0xd9, 0x11, 0x53, 0x9b, 0xde, 0x16, 0x5a, 0x92, 0xd7, 0x1f, 0x5d, 0x95, 0xd0, 0x18, 0xe0, 0x28, 0x6d, 0xa5, 0xe7, 0x2f, 0x6a, 0xa2, 0xee, 0x26, 0x63, 0xab, 0xe9, 0x21, 0x64, 0xac, 0xfc, 0x34, 0x71, 0xb9, 0xfb, 0x33, 0x76, 0xbe, 0xf2, 0x3a, 0x7f, 0xb7, 0xf5, 0x3d, 0x78, 0xb0, 0xd8, 0x10, 0x55, 0x9d, 0xdf, 0x17, 0x52, 0x9a, 0xd6, 0x1e, 0x5b, 0x93, 0xd1, 0x19, 0x5c, 0x94, 0xc4, 0xc, 0x49, 0x81, 0xc3, 0xb, 0x4e, 0x86, 0xca, 0x2, 0x47, 0x8f, 0xcd, 0x5, 0x40, 0x88, 0x90, 0x58, 0x1d, 0xd5, 0x97, 0x5f, 0x1a, 0xd2, 0x9e, 0x56, 0x13, 0xdb, 0x99, 0x51, 0x14, 0xdc, 
0x8c, 0x44, 0x1, 0xc9, 0x8b, 0x43, 0x6, 0xce, 0x82, 0x4a, 0xf, 0xc7, 0x85, 0x4d, 0x8, 0xc0, 0xa8, 0x60, 0x25, 0xed, 0xaf, 0x67, 0x22, 0xea, 0xa6, 0x6e, 0x2b, 0xe3, 0xa1, 0x69, 0x2c, 0xe4, 0xb4, 0x7c, 0x39, 0xf1, 0xb3, 0x7b, 0x3e, 0xf6, 0xba, 0x72, 0x37, 0xff, 0xbd, 0x75, 0x30, 0xf8}, + {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43, 0xc, 0xc5, 0x83, 0x4a, 0xf, 0xc6, 0x80, 0x49, 0xa, 0xc3, 0x85, 0x4c, 0x9, 0xc0, 0x86, 0x4f, 0x18, 0xd1, 0x97, 0x5e, 0x1b, 0xd2, 0x94, 0x5d, 0x1e, 0xd7, 0x91, 0x58, 0x1d, 0xd4, 0x92, 0x5b, 0x14, 0xdd, 0x9b, 0x52, 0x17, 0xde, 0x98, 0x51, 0x12, 0xdb, 0x9d, 0x54, 0x11, 0xd8, 0x9e, 0x57, 0x30, 0xf9, 0xbf, 0x76, 0x33, 0xfa, 0xbc, 0x75, 0x36, 0xff, 0xb9, 0x70, 0x35, 0xfc, 0xba, 0x73, 0x3c, 0xf5, 0xb3, 0x7a, 0x3f, 0xf6, 0xb0, 0x79, 0x3a, 0xf3, 0xb5, 0x7c, 0x39, 0xf0, 0xb6, 0x7f, 0x28, 0xe1, 0xa7, 0x6e, 0x2b, 0xe2, 0xa4, 0x6d, 0x2e, 0xe7, 0xa1, 0x68, 0x2d, 0xe4, 0xa2, 0x6b, 0x24, 0xed, 0xab, 0x62, 0x27, 0xee, 0xa8, 0x61, 0x22, 0xeb, 0xad, 0x64, 0x21, 0xe8, 0xae, 0x67, 0x60, 0xa9, 0xef, 0x26, 0x63, 0xaa, 0xec, 0x25, 0x66, 0xaf, 0xe9, 0x20, 0x65, 0xac, 0xea, 0x23, 0x6c, 0xa5, 0xe3, 0x2a, 0x6f, 0xa6, 0xe0, 0x29, 0x6a, 0xa3, 0xe5, 0x2c, 0x69, 0xa0, 0xe6, 0x2f, 0x78, 0xb1, 0xf7, 0x3e, 0x7b, 0xb2, 0xf4, 0x3d, 0x7e, 0xb7, 0xf1, 0x38, 0x7d, 0xb4, 0xf2, 0x3b, 0x74, 0xbd, 0xfb, 0x32, 0x77, 0xbe, 0xf8, 0x31, 0x72, 0xbb, 0xfd, 0x34, 0x71, 0xb8, 0xfe, 0x37, 0x50, 0x99, 0xdf, 0x16, 0x53, 0x9a, 0xdc, 0x15, 0x56, 0x9f, 0xd9, 0x10, 0x55, 0x9c, 0xda, 0x13, 0x5c, 0x95, 0xd3, 0x1a, 0x5f, 0x96, 0xd0, 0x19, 0x5a, 0x93, 0xd5, 0x1c, 0x59, 0x90, 0xd6, 0x1f, 0x48, 0x81, 0xc7, 0xe, 0x4b, 0x82, 0xc4, 0xd, 0x4e, 0x87, 0xc1, 0x8, 0x4d, 0x84, 0xc2, 0xb, 0x44, 0x8d, 0xcb, 0x2, 0x47, 0x8e, 0xc8, 0x1, 0x42, 0x8b, 0xcd, 0x4, 0x41, 0x88, 0xce, 0x7}, + {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52, 0x3c, 0xf6, 0xb5, 0x7f, 0x33, 0xf9, 0xba, 0x70, 0x22, 0xe8, 0xab, 0x61, 0x2d, 0xe7, 0xa4, 0x6e, 0x78, 0xb2, 0xf1, 0x3b, 0x77, 0xbd, 0xfe, 0x34, 0x66, 0xac, 0xef, 0x25, 0x69, 0xa3, 0xe0, 0x2a, 0x44, 0x8e, 0xcd, 0x7, 0x4b, 0x81, 0xc2, 0x8, 0x5a, 0x90, 0xd3, 0x19, 0x55, 0x9f, 0xdc, 0x16, 0xf0, 0x3a, 0x79, 0xb3, 0xff, 0x35, 0x76, 0xbc, 0xee, 0x24, 0x67, 0xad, 0xe1, 0x2b, 0x68, 0xa2, 0xcc, 0x6, 0x45, 0x8f, 0xc3, 0x9, 0x4a, 0x80, 0xd2, 0x18, 0x5b, 0x91, 0xdd, 0x17, 0x54, 0x9e, 0x88, 0x42, 0x1, 0xcb, 0x87, 0x4d, 0xe, 0xc4, 0x96, 0x5c, 0x1f, 0xd5, 0x99, 0x53, 0x10, 0xda, 0xb4, 0x7e, 0x3d, 0xf7, 0xbb, 0x71, 0x32, 0xf8, 0xaa, 0x60, 0x23, 0xe9, 0xa5, 0x6f, 0x2c, 0xe6, 0xfd, 0x37, 0x74, 0xbe, 0xf2, 0x38, 0x7b, 0xb1, 0xe3, 0x29, 0x6a, 0xa0, 0xec, 0x26, 0x65, 0xaf, 0xc1, 0xb, 0x48, 0x82, 0xce, 0x4, 0x47, 0x8d, 0xdf, 0x15, 0x56, 0x9c, 0xd0, 0x1a, 0x59, 0x93, 0x85, 0x4f, 0xc, 0xc6, 0x8a, 0x40, 0x3, 0xc9, 0x9b, 0x51, 0x12, 0xd8, 0x94, 0x5e, 0x1d, 0xd7, 0xb9, 0x73, 0x30, 0xfa, 0xb6, 0x7c, 0x3f, 0xf5, 0xa7, 0x6d, 0x2e, 0xe4, 0xa8, 0x62, 0x21, 0xeb, 0xd, 0xc7, 0x84, 0x4e, 0x2, 0xc8, 0x8b, 0x41, 0x13, 0xd9, 0x9a, 0x50, 0x1c, 0xd6, 0x95, 0x5f, 0x31, 0xfb, 0xb8, 0x72, 0x3e, 0xf4, 0xb7, 0x7d, 0x2f, 0xe5, 0xa6, 0x6c, 0x20, 0xea, 0xa9, 0x63, 0x75, 0xbf, 0xfc, 0x36, 0x7a, 0xb0, 0xf3, 0x39, 0x6b, 0xa1, 0xe2, 0x28, 0x64, 0xae, 0xed, 0x27, 0x49, 0x83, 0xc0, 0xa, 0x46, 0x8c, 0xcf, 0x5, 0x57, 0x9d, 0xde, 0x14, 0x58, 0x92, 0xd1, 0x1b}, + {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d, 0x2c, 0xe7, 0xa7, 0x6c, 0x27, 0xec, 0xac, 0x67, 0x3a, 0xf1, 0xb1, 0x7a, 0x31, 0xfa, 0xba, 0x71, 0x58, 0x93, 0xd3, 0x18, 
0x53, 0x98, 0xd8, 0x13, 0x4e, 0x85, 0xc5, 0xe, 0x45, 0x8e, 0xce, 0x5, 0x74, 0xbf, 0xff, 0x34, 0x7f, 0xb4, 0xf4, 0x3f, 0x62, 0xa9, 0xe9, 0x22, 0x69, 0xa2, 0xe2, 0x29, 0xb0, 0x7b, 0x3b, 0xf0, 0xbb, 0x70, 0x30, 0xfb, 0xa6, 0x6d, 0x2d, 0xe6, 0xad, 0x66, 0x26, 0xed, 0x9c, 0x57, 0x17, 0xdc, 0x97, 0x5c, 0x1c, 0xd7, 0x8a, 0x41, 0x1, 0xca, 0x81, 0x4a, 0xa, 0xc1, 0xe8, 0x23, 0x63, 0xa8, 0xe3, 0x28, 0x68, 0xa3, 0xfe, 0x35, 0x75, 0xbe, 0xf5, 0x3e, 0x7e, 0xb5, 0xc4, 0xf, 0x4f, 0x84, 0xcf, 0x4, 0x44, 0x8f, 0xd2, 0x19, 0x59, 0x92, 0xd9, 0x12, 0x52, 0x99, 0x7d, 0xb6, 0xf6, 0x3d, 0x76, 0xbd, 0xfd, 0x36, 0x6b, 0xa0, 0xe0, 0x2b, 0x60, 0xab, 0xeb, 0x20, 0x51, 0x9a, 0xda, 0x11, 0x5a, 0x91, 0xd1, 0x1a, 0x47, 0x8c, 0xcc, 0x7, 0x4c, 0x87, 0xc7, 0xc, 0x25, 0xee, 0xae, 0x65, 0x2e, 0xe5, 0xa5, 0x6e, 0x33, 0xf8, 0xb8, 0x73, 0x38, 0xf3, 0xb3, 0x78, 0x9, 0xc2, 0x82, 0x49, 0x2, 0xc9, 0x89, 0x42, 0x1f, 0xd4, 0x94, 0x5f, 0x14, 0xdf, 0x9f, 0x54, 0xcd, 0x6, 0x46, 0x8d, 0xc6, 0xd, 0x4d, 0x86, 0xdb, 0x10, 0x50, 0x9b, 0xd0, 0x1b, 0x5b, 0x90, 0xe1, 0x2a, 0x6a, 0xa1, 0xea, 0x21, 0x61, 0xaa, 0xf7, 0x3c, 0x7c, 0xb7, 0xfc, 0x37, 0x77, 0xbc, 0x95, 0x5e, 0x1e, 0xd5, 0x9e, 0x55, 0x15, 0xde, 0x83, 0x48, 0x8, 0xc3, 0x88, 0x43, 0x3, 0xc8, 0xb9, 0x72, 0x32, 0xf9, 0xb2, 0x79, 0x39, 0xf2, 0xaf, 0x64, 0x24, 0xef, 0xa4, 0x6f, 0x2f, 0xe4}, + {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70, 0x5c, 0x90, 0xd9, 0x15, 0x4b, 0x87, 0xce, 0x2, 0x72, 0xbe, 0xf7, 0x3b, 0x65, 0xa9, 0xe0, 0x2c, 0xb8, 0x74, 0x3d, 0xf1, 0xaf, 0x63, 0x2a, 0xe6, 0x96, 0x5a, 0x13, 0xdf, 0x81, 0x4d, 0x4, 0xc8, 0xe4, 0x28, 0x61, 0xad, 0xf3, 0x3f, 0x76, 0xba, 0xca, 0x6, 0x4f, 0x83, 0xdd, 0x11, 0x58, 0x94, 0x6d, 0xa1, 0xe8, 0x24, 0x7a, 0xb6, 0xff, 0x33, 0x43, 0x8f, 0xc6, 0xa, 0x54, 0x98, 0xd1, 0x1d, 0x31, 0xfd, 0xb4, 0x78, 0x26, 0xea, 0xa3, 0x6f, 0x1f, 0xd3, 0x9a, 0x56, 0x8, 0xc4, 0x8d, 0x41, 0xd5, 0x19, 0x50, 0x9c, 0xc2, 0xe, 0x47, 0x8b, 0xfb, 0x37, 0x7e, 0xb2, 0xec, 0x20, 0x69, 0xa5, 0x89, 0x45, 0xc, 0xc0, 0x9e, 0x52, 0x1b, 0xd7, 0xa7, 0x6b, 0x22, 0xee, 0xb0, 0x7c, 0x35, 0xf9, 0xda, 0x16, 0x5f, 0x93, 0xcd, 0x1, 0x48, 0x84, 0xf4, 0x38, 0x71, 0xbd, 0xe3, 0x2f, 0x66, 0xaa, 0x86, 0x4a, 0x3, 0xcf, 0x91, 0x5d, 0x14, 0xd8, 0xa8, 0x64, 0x2d, 0xe1, 0xbf, 0x73, 0x3a, 0xf6, 0x62, 0xae, 0xe7, 0x2b, 0x75, 0xb9, 0xf0, 0x3c, 0x4c, 0x80, 0xc9, 0x5, 0x5b, 0x97, 0xde, 0x12, 0x3e, 0xf2, 0xbb, 0x77, 0x29, 0xe5, 0xac, 0x60, 0x10, 0xdc, 0x95, 0x59, 0x7, 0xcb, 0x82, 0x4e, 0xb7, 0x7b, 0x32, 0xfe, 0xa0, 0x6c, 0x25, 0xe9, 0x99, 0x55, 0x1c, 0xd0, 0x8e, 0x42, 0xb, 0xc7, 0xeb, 0x27, 0x6e, 0xa2, 0xfc, 0x30, 0x79, 0xb5, 0xc5, 0x9, 0x40, 0x8c, 0xd2, 0x1e, 0x57, 0x9b, 0xf, 0xc3, 0x8a, 0x46, 0x18, 0xd4, 0x9d, 0x51, 0x21, 0xed, 0xa4, 0x68, 0x36, 0xfa, 0xb3, 0x7f, 0x53, 0x9f, 0xd6, 0x1a, 0x44, 0x88, 0xc1, 0xd, 0x7d, 0xb1, 0xf8, 0x34, 0x6a, 0xa6, 0xef, 0x23}, + {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f, 0x4c, 0x81, 0xcb, 0x6, 0x5f, 0x92, 0xd8, 0x15, 0x6a, 0xa7, 0xed, 0x20, 0x79, 0xb4, 0xfe, 0x33, 0x98, 0x55, 0x1f, 0xd2, 0x8b, 0x46, 0xc, 0xc1, 0xbe, 0x73, 0x39, 0xf4, 0xad, 0x60, 0x2a, 0xe7, 0xd4, 0x19, 0x53, 0x9e, 0xc7, 0xa, 0x40, 0x8d, 0xf2, 0x3f, 0x75, 0xb8, 0xe1, 0x2c, 0x66, 0xab, 0x2d, 0xe0, 0xaa, 0x67, 0x3e, 0xf3, 0xb9, 0x74, 0xb, 0xc6, 0x8c, 0x41, 0x18, 0xd5, 0x9f, 0x52, 0x61, 0xac, 0xe6, 0x2b, 0x72, 0xbf, 0xf5, 0x38, 0x47, 0x8a, 0xc0, 0xd, 0x54, 0x99, 0xd3, 0x1e, 0xb5, 0x78, 0x32, 0xff, 0xa6, 0x6b, 0x21, 0xec, 0x93, 0x5e, 0x14, 0xd9, 0x80, 0x4d, 0x7, 0xca, 0xf9, 0x34, 0x7e, 0xb3, 0xea, 0x27, 0x6d, 0xa0, 0xdf, 
0x12, 0x58, 0x95, 0xcc, 0x1, 0x4b, 0x86, 0x5a, 0x97, 0xdd, 0x10, 0x49, 0x84, 0xce, 0x3, 0x7c, 0xb1, 0xfb, 0x36, 0x6f, 0xa2, 0xe8, 0x25, 0x16, 0xdb, 0x91, 0x5c, 0x5, 0xc8, 0x82, 0x4f, 0x30, 0xfd, 0xb7, 0x7a, 0x23, 0xee, 0xa4, 0x69, 0xc2, 0xf, 0x45, 0x88, 0xd1, 0x1c, 0x56, 0x9b, 0xe4, 0x29, 0x63, 0xae, 0xf7, 0x3a, 0x70, 0xbd, 0x8e, 0x43, 0x9, 0xc4, 0x9d, 0x50, 0x1a, 0xd7, 0xa8, 0x65, 0x2f, 0xe2, 0xbb, 0x76, 0x3c, 0xf1, 0x77, 0xba, 0xf0, 0x3d, 0x64, 0xa9, 0xe3, 0x2e, 0x51, 0x9c, 0xd6, 0x1b, 0x42, 0x8f, 0xc5, 0x8, 0x3b, 0xf6, 0xbc, 0x71, 0x28, 0xe5, 0xaf, 0x62, 0x1d, 0xd0, 0x9a, 0x57, 0xe, 0xc3, 0x89, 0x44, 0xef, 0x22, 0x68, 0xa5, 0xfc, 0x31, 0x7b, 0xb6, 0xc9, 0x4, 0x4e, 0x83, 0xda, 0x17, 0x5d, 0x90, 0xa3, 0x6e, 0x24, 0xe9, 0xb0, 0x7d, 0x37, 0xfa, 0x85, 0x48, 0x2, 0xcf, 0x96, 0x5b, 0x11, 0xdc}, + {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e, 0x7c, 0xb2, 0xfd, 0x33, 0x63, 0xad, 0xe2, 0x2c, 0x42, 0x8c, 0xc3, 0xd, 0x5d, 0x93, 0xdc, 0x12, 0xf8, 0x36, 0x79, 0xb7, 0xe7, 0x29, 0x66, 0xa8, 0xc6, 0x8, 0x47, 0x89, 0xd9, 0x17, 0x58, 0x96, 0x84, 0x4a, 0x5, 0xcb, 0x9b, 0x55, 0x1a, 0xd4, 0xba, 0x74, 0x3b, 0xf5, 0xa5, 0x6b, 0x24, 0xea, 0xed, 0x23, 0x6c, 0xa2, 0xf2, 0x3c, 0x73, 0xbd, 0xd3, 0x1d, 0x52, 0x9c, 0xcc, 0x2, 0x4d, 0x83, 0x91, 0x5f, 0x10, 0xde, 0x8e, 0x40, 0xf, 0xc1, 0xaf, 0x61, 0x2e, 0xe0, 0xb0, 0x7e, 0x31, 0xff, 0x15, 0xdb, 0x94, 0x5a, 0xa, 0xc4, 0x8b, 0x45, 0x2b, 0xe5, 0xaa, 0x64, 0x34, 0xfa, 0xb5, 0x7b, 0x69, 0xa7, 0xe8, 0x26, 0x76, 0xb8, 0xf7, 0x39, 0x57, 0x99, 0xd6, 0x18, 0x48, 0x86, 0xc9, 0x7, 0xc7, 0x9, 0x46, 0x88, 0xd8, 0x16, 0x59, 0x97, 0xf9, 0x37, 0x78, 0xb6, 0xe6, 0x28, 0x67, 0xa9, 0xbb, 0x75, 0x3a, 0xf4, 0xa4, 0x6a, 0x25, 0xeb, 0x85, 0x4b, 0x4, 0xca, 0x9a, 0x54, 0x1b, 0xd5, 0x3f, 0xf1, 0xbe, 0x70, 0x20, 0xee, 0xa1, 0x6f, 0x1, 0xcf, 0x80, 0x4e, 0x1e, 0xd0, 0x9f, 0x51, 0x43, 0x8d, 0xc2, 0xc, 0x5c, 0x92, 0xdd, 0x13, 0x7d, 0xb3, 0xfc, 0x32, 0x62, 0xac, 0xe3, 0x2d, 0x2a, 0xe4, 0xab, 0x65, 0x35, 0xfb, 0xb4, 0x7a, 0x14, 0xda, 0x95, 0x5b, 0xb, 0xc5, 0x8a, 0x44, 0x56, 0x98, 0xd7, 0x19, 0x49, 0x87, 0xc8, 0x6, 0x68, 0xa6, 0xe9, 0x27, 0x77, 0xb9, 0xf6, 0x38, 0xd2, 0x1c, 0x53, 0x9d, 0xcd, 0x3, 0x4c, 0x82, 0xec, 0x22, 0x6d, 0xa3, 0xf3, 0x3d, 0x72, 0xbc, 0xae, 0x60, 0x2f, 0xe1, 0xb1, 0x7f, 0x30, 0xfe, 0x90, 0x5e, 0x11, 0xdf, 0x8f, 0x41, 0xe, 0xc0}, + {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61, 0x6c, 0xa3, 0xef, 0x20, 0x77, 0xb8, 0xf4, 0x3b, 0x5a, 0x95, 0xd9, 0x16, 0x41, 0x8e, 0xc2, 0xd, 0xd8, 0x17, 0x5b, 0x94, 0xc3, 0xc, 0x40, 0x8f, 0xee, 0x21, 0x6d, 0xa2, 0xf5, 0x3a, 0x76, 0xb9, 0xb4, 0x7b, 0x37, 0xf8, 0xaf, 0x60, 0x2c, 0xe3, 0x82, 0x4d, 0x1, 0xce, 0x99, 0x56, 0x1a, 0xd5, 0xad, 0x62, 0x2e, 0xe1, 0xb6, 0x79, 0x35, 0xfa, 0x9b, 0x54, 0x18, 0xd7, 0x80, 0x4f, 0x3, 0xcc, 0xc1, 0xe, 0x42, 0x8d, 0xda, 0x15, 0x59, 0x96, 0xf7, 0x38, 0x74, 0xbb, 0xec, 0x23, 0x6f, 0xa0, 0x75, 0xba, 0xf6, 0x39, 0x6e, 0xa1, 0xed, 0x22, 0x43, 0x8c, 0xc0, 0xf, 0x58, 0x97, 0xdb, 0x14, 0x19, 0xd6, 0x9a, 0x55, 0x2, 0xcd, 0x81, 0x4e, 0x2f, 0xe0, 0xac, 0x63, 0x34, 0xfb, 0xb7, 0x78, 0x47, 0x88, 0xc4, 0xb, 0x5c, 0x93, 0xdf, 0x10, 0x71, 0xbe, 0xf2, 0x3d, 0x6a, 0xa5, 0xe9, 0x26, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c, 0x1d, 0xd2, 0x9e, 0x51, 0x6, 0xc9, 0x85, 0x4a, 0x9f, 0x50, 0x1c, 0xd3, 0x84, 0x4b, 0x7, 0xc8, 0xa9, 0x66, 0x2a, 0xe5, 0xb2, 0x7d, 0x31, 0xfe, 0xf3, 0x3c, 0x70, 0xbf, 0xe8, 0x27, 0x6b, 0xa4, 0xc5, 0xa, 0x46, 0x89, 0xde, 0x11, 0x5d, 0x92, 0xea, 0x25, 0x69, 0xa6, 0xf1, 0x3e, 0x72, 0xbd, 0xdc, 0x13, 0x5f, 0x90, 0xc7, 0x8, 
0x44, 0x8b, 0x86, 0x49, 0x5, 0xca, 0x9d, 0x52, 0x1e, 0xd1, 0xb0, 0x7f, 0x33, 0xfc, 0xab, 0x64, 0x28, 0xe7, 0x32, 0xfd, 0xb1, 0x7e, 0x29, 0xe6, 0xaa, 0x65, 0x4, 0xcb, 0x87, 0x48, 0x1f, 0xd0, 0x9c, 0x53, 0x5e, 0x91, 0xdd, 0x12, 0x45, 0x8a, 0xc6, 0x9, 0x68, 0xa7, 0xeb, 0x24, 0x73, 0xbc, 0xf0, 0x3f}, + {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4, 0x81, 0x51, 0x3c, 0xec, 0xe6, 0x36, 0x5b, 0x8b, 0x4f, 0x9f, 0xf2, 0x22, 0x28, 0xf8, 0x95, 0x45, 0x1f, 0xcf, 0xa2, 0x72, 0x78, 0xa8, 0xc5, 0x15, 0xd1, 0x1, 0x6c, 0xbc, 0xb6, 0x66, 0xb, 0xdb, 0x9e, 0x4e, 0x23, 0xf3, 0xf9, 0x29, 0x44, 0x94, 0x50, 0x80, 0xed, 0x3d, 0x37, 0xe7, 0x8a, 0x5a, 0x3e, 0xee, 0x83, 0x53, 0x59, 0x89, 0xe4, 0x34, 0xf0, 0x20, 0x4d, 0x9d, 0x97, 0x47, 0x2a, 0xfa, 0xbf, 0x6f, 0x2, 0xd2, 0xd8, 0x8, 0x65, 0xb5, 0x71, 0xa1, 0xcc, 0x1c, 0x16, 0xc6, 0xab, 0x7b, 0x21, 0xf1, 0x9c, 0x4c, 0x46, 0x96, 0xfb, 0x2b, 0xef, 0x3f, 0x52, 0x82, 0x88, 0x58, 0x35, 0xe5, 0xa0, 0x70, 0x1d, 0xcd, 0xc7, 0x17, 0x7a, 0xaa, 0x6e, 0xbe, 0xd3, 0x3, 0x9, 0xd9, 0xb4, 0x64, 0x7c, 0xac, 0xc1, 0x11, 0x1b, 0xcb, 0xa6, 0x76, 0xb2, 0x62, 0xf, 0xdf, 0xd5, 0x5, 0x68, 0xb8, 0xfd, 0x2d, 0x40, 0x90, 0x9a, 0x4a, 0x27, 0xf7, 0x33, 0xe3, 0x8e, 0x5e, 0x54, 0x84, 0xe9, 0x39, 0x63, 0xb3, 0xde, 0xe, 0x4, 0xd4, 0xb9, 0x69, 0xad, 0x7d, 0x10, 0xc0, 0xca, 0x1a, 0x77, 0xa7, 0xe2, 0x32, 0x5f, 0x8f, 0x85, 0x55, 0x38, 0xe8, 0x2c, 0xfc, 0x91, 0x41, 0x4b, 0x9b, 0xf6, 0x26, 0x42, 0x92, 0xff, 0x2f, 0x25, 0xf5, 0x98, 0x48, 0x8c, 0x5c, 0x31, 0xe1, 0xeb, 0x3b, 0x56, 0x86, 0xc3, 0x13, 0x7e, 0xae, 0xa4, 0x74, 0x19, 0xc9, 0xd, 0xdd, 0xb0, 0x60, 0x6a, 0xba, 0xd7, 0x7, 0x5d, 0x8d, 0xe0, 0x30, 0x3a, 0xea, 0x87, 0x57, 0x93, 0x43, 0x2e, 0xfe, 0xf4, 0x24, 0x49, 0x99, 0xdc, 0xc, 0x61, 0xb1, 0xbb, 0x6b, 0x6, 0xd6, 0x12, 0xc2, 0xaf, 0x7f, 0x75, 0xa5, 0xc8, 0x18}, + {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb, 0x91, 0x40, 0x2e, 0xff, 0xf2, 0x23, 0x4d, 0x9c, 0x57, 0x86, 0xe8, 0x39, 0x34, 0xe5, 0x8b, 0x5a, 0x3f, 0xee, 0x80, 0x51, 0x5c, 0x8d, 0xe3, 0x32, 0xf9, 0x28, 0x46, 0x97, 0x9a, 0x4b, 0x25, 0xf4, 0xae, 0x7f, 0x11, 0xc0, 0xcd, 0x1c, 0x72, 0xa3, 0x68, 0xb9, 0xd7, 0x6, 0xb, 0xda, 0xb4, 0x65, 0x7e, 0xaf, 0xc1, 0x10, 0x1d, 0xcc, 0xa2, 0x73, 0xb8, 0x69, 0x7, 0xd6, 0xdb, 0xa, 0x64, 0xb5, 0xef, 0x3e, 0x50, 0x81, 0x8c, 0x5d, 0x33, 0xe2, 0x29, 0xf8, 0x96, 0x47, 0x4a, 0x9b, 0xf5, 0x24, 0x41, 0x90, 0xfe, 0x2f, 0x22, 0xf3, 0x9d, 0x4c, 0x87, 0x56, 0x38, 0xe9, 0xe4, 0x35, 0x5b, 0x8a, 0xd0, 0x1, 0x6f, 0xbe, 0xb3, 0x62, 0xc, 0xdd, 0x16, 0xc7, 0xa9, 0x78, 0x75, 0xa4, 0xca, 0x1b, 0xfc, 0x2d, 0x43, 0x92, 0x9f, 0x4e, 0x20, 0xf1, 0x3a, 0xeb, 0x85, 0x54, 0x59, 0x88, 0xe6, 0x37, 0x6d, 0xbc, 0xd2, 0x3, 0xe, 0xdf, 0xb1, 0x60, 0xab, 0x7a, 0x14, 0xc5, 0xc8, 0x19, 0x77, 0xa6, 0xc3, 0x12, 0x7c, 0xad, 0xa0, 0x71, 0x1f, 0xce, 0x5, 0xd4, 0xba, 0x6b, 0x66, 0xb7, 0xd9, 0x8, 0x52, 0x83, 0xed, 0x3c, 0x31, 0xe0, 0x8e, 0x5f, 0x94, 0x45, 0x2b, 0xfa, 0xf7, 0x26, 0x48, 0x99, 0x82, 0x53, 0x3d, 0xec, 0xe1, 0x30, 0x5e, 0x8f, 0x44, 0x95, 0xfb, 0x2a, 0x27, 0xf6, 0x98, 0x49, 0x13, 0xc2, 0xac, 0x7d, 0x70, 0xa1, 0xcf, 0x1e, 0xd5, 0x4, 0x6a, 0xbb, 0xb6, 0x67, 0x9, 0xd8, 0xbd, 0x6c, 0x2, 0xd3, 0xde, 0xf, 0x61, 0xb0, 0x7b, 0xaa, 0xc4, 0x15, 0x18, 0xc9, 0xa7, 0x76, 0x2c, 0xfd, 0x93, 0x42, 0x4f, 0x9e, 0xf0, 0x21, 0xea, 0x3b, 0x55, 0x84, 0x89, 0x58, 0x36, 0xe7}, + {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda, 0xa1, 0x73, 0x18, 0xca, 0xce, 0x1c, 0x77, 0xa5, 0x7f, 0xad, 0xc6, 0x14, 0x10, 0xc2, 0xa9, 0x7b, 0x5f, 0x8d, 0xe6, 
0x34, 0x30, 0xe2, 0x89, 0x5b, 0x81, 0x53, 0x38, 0xea, 0xee, 0x3c, 0x57, 0x85, 0xfe, 0x2c, 0x47, 0x95, 0x91, 0x43, 0x28, 0xfa, 0x20, 0xf2, 0x99, 0x4b, 0x4f, 0x9d, 0xf6, 0x24, 0xbe, 0x6c, 0x7, 0xd5, 0xd1, 0x3, 0x68, 0xba, 0x60, 0xb2, 0xd9, 0xb, 0xf, 0xdd, 0xb6, 0x64, 0x1f, 0xcd, 0xa6, 0x74, 0x70, 0xa2, 0xc9, 0x1b, 0xc1, 0x13, 0x78, 0xaa, 0xae, 0x7c, 0x17, 0xc5, 0xe1, 0x33, 0x58, 0x8a, 0x8e, 0x5c, 0x37, 0xe5, 0x3f, 0xed, 0x86, 0x54, 0x50, 0x82, 0xe9, 0x3b, 0x40, 0x92, 0xf9, 0x2b, 0x2f, 0xfd, 0x96, 0x44, 0x9e, 0x4c, 0x27, 0xf5, 0xf1, 0x23, 0x48, 0x9a, 0x61, 0xb3, 0xd8, 0xa, 0xe, 0xdc, 0xb7, 0x65, 0xbf, 0x6d, 0x6, 0xd4, 0xd0, 0x2, 0x69, 0xbb, 0xc0, 0x12, 0x79, 0xab, 0xaf, 0x7d, 0x16, 0xc4, 0x1e, 0xcc, 0xa7, 0x75, 0x71, 0xa3, 0xc8, 0x1a, 0x3e, 0xec, 0x87, 0x55, 0x51, 0x83, 0xe8, 0x3a, 0xe0, 0x32, 0x59, 0x8b, 0x8f, 0x5d, 0x36, 0xe4, 0x9f, 0x4d, 0x26, 0xf4, 0xf0, 0x22, 0x49, 0x9b, 0x41, 0x93, 0xf8, 0x2a, 0x2e, 0xfc, 0x97, 0x45, 0xdf, 0xd, 0x66, 0xb4, 0xb0, 0x62, 0x9, 0xdb, 0x1, 0xd3, 0xb8, 0x6a, 0x6e, 0xbc, 0xd7, 0x5, 0x7e, 0xac, 0xc7, 0x15, 0x11, 0xc3, 0xa8, 0x7a, 0xa0, 0x72, 0x19, 0xcb, 0xcf, 0x1d, 0x76, 0xa4, 0x80, 0x52, 0x39, 0xeb, 0xef, 0x3d, 0x56, 0x84, 0x5e, 0x8c, 0xe7, 0x35, 0x31, 0xe3, 0x88, 0x5a, 0x21, 0xf3, 0x98, 0x4a, 0x4e, 0x9c, 0xf7, 0x25, 0xff, 0x2d, 0x46, 0x94, 0x90, 0x42, 0x29, 0xfb}, + {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5, 0xb1, 0x62, 0xa, 0xd9, 0xda, 0x9, 0x61, 0xb2, 0x67, 0xb4, 0xdc, 0xf, 0xc, 0xdf, 0xb7, 0x64, 0x7f, 0xac, 0xc4, 0x17, 0x14, 0xc7, 0xaf, 0x7c, 0xa9, 0x7a, 0x12, 0xc1, 0xc2, 0x11, 0x79, 0xaa, 0xce, 0x1d, 0x75, 0xa6, 0xa5, 0x76, 0x1e, 0xcd, 0x18, 0xcb, 0xa3, 0x70, 0x73, 0xa0, 0xc8, 0x1b, 0xfe, 0x2d, 0x45, 0x96, 0x95, 0x46, 0x2e, 0xfd, 0x28, 0xfb, 0x93, 0x40, 0x43, 0x90, 0xf8, 0x2b, 0x4f, 0x9c, 0xf4, 0x27, 0x24, 0xf7, 0x9f, 0x4c, 0x99, 0x4a, 0x22, 0xf1, 0xf2, 0x21, 0x49, 0x9a, 0x81, 0x52, 0x3a, 0xe9, 0xea, 0x39, 0x51, 0x82, 0x57, 0x84, 0xec, 0x3f, 0x3c, 0xef, 0x87, 0x54, 0x30, 0xe3, 0x8b, 0x58, 0x5b, 0x88, 0xe0, 0x33, 0xe6, 0x35, 0x5d, 0x8e, 0x8d, 0x5e, 0x36, 0xe5, 0xe1, 0x32, 0x5a, 0x89, 0x8a, 0x59, 0x31, 0xe2, 0x37, 0xe4, 0x8c, 0x5f, 0x5c, 0x8f, 0xe7, 0x34, 0x50, 0x83, 0xeb, 0x38, 0x3b, 0xe8, 0x80, 0x53, 0x86, 0x55, 0x3d, 0xee, 0xed, 0x3e, 0x56, 0x85, 0x9e, 0x4d, 0x25, 0xf6, 0xf5, 0x26, 0x4e, 0x9d, 0x48, 0x9b, 0xf3, 0x20, 0x23, 0xf0, 0x98, 0x4b, 0x2f, 0xfc, 0x94, 0x47, 0x44, 0x97, 0xff, 0x2c, 0xf9, 0x2a, 0x42, 0x91, 0x92, 0x41, 0x29, 0xfa, 0x1f, 0xcc, 0xa4, 0x77, 0x74, 0xa7, 0xcf, 0x1c, 0xc9, 0x1a, 0x72, 0xa1, 0xa2, 0x71, 0x19, 0xca, 0xae, 0x7d, 0x15, 0xc6, 0xc5, 0x16, 0x7e, 0xad, 0x78, 0xab, 0xc3, 0x10, 0x13, 0xc0, 0xa8, 0x7b, 0x60, 0xb3, 0xdb, 0x8, 0xb, 0xd8, 0xb0, 0x63, 0xb6, 0x65, 0xd, 0xde, 0xdd, 0xe, 0x66, 0xb5, 0xd1, 0x2, 0x6a, 0xb9, 0xba, 0x69, 0x1, 0xd2, 0x7, 0xd4, 0xbc, 0x6f, 0x6c, 0xbf, 0xd7, 0x4}, + {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8, 0xc1, 0x15, 0x74, 0xa0, 0xb6, 0x62, 0x3, 0xd7, 0x2f, 0xfb, 0x9a, 0x4e, 0x58, 0x8c, 0xed, 0x39, 0x9f, 0x4b, 0x2a, 0xfe, 0xe8, 0x3c, 0x5d, 0x89, 0x71, 0xa5, 0xc4, 0x10, 0x6, 0xd2, 0xb3, 0x67, 0x5e, 0x8a, 0xeb, 0x3f, 0x29, 0xfd, 0x9c, 0x48, 0xb0, 0x64, 0x5, 0xd1, 0xc7, 0x13, 0x72, 0xa6, 0x23, 0xf7, 0x96, 0x42, 0x54, 0x80, 0xe1, 0x35, 0xcd, 0x19, 0x78, 0xac, 0xba, 0x6e, 0xf, 0xdb, 0xe2, 0x36, 0x57, 0x83, 0x95, 0x41, 0x20, 0xf4, 0xc, 0xd8, 0xb9, 0x6d, 0x7b, 0xaf, 0xce, 0x1a, 0xbc, 0x68, 0x9, 0xdd, 0xcb, 0x1f, 0x7e, 0xaa, 0x52, 0x86, 0xe7, 0x33, 0x25, 0xf1, 0x90, 0x44, 0x7d, 0xa9, 0xc8, 0x1c, 0xa, 0xde, 0xbf, 0x6b, 
0x93, 0x47, 0x26, 0xf2, 0xe4, 0x30, 0x51, 0x85, 0x46, 0x92, 0xf3, 0x27, 0x31, 0xe5, 0x84, 0x50, 0xa8, 0x7c, 0x1d, 0xc9, 0xdf, 0xb, 0x6a, 0xbe, 0x87, 0x53, 0x32, 0xe6, 0xf0, 0x24, 0x45, 0x91, 0x69, 0xbd, 0xdc, 0x8, 0x1e, 0xca, 0xab, 0x7f, 0xd9, 0xd, 0x6c, 0xb8, 0xae, 0x7a, 0x1b, 0xcf, 0x37, 0xe3, 0x82, 0x56, 0x40, 0x94, 0xf5, 0x21, 0x18, 0xcc, 0xad, 0x79, 0x6f, 0xbb, 0xda, 0xe, 0xf6, 0x22, 0x43, 0x97, 0x81, 0x55, 0x34, 0xe0, 0x65, 0xb1, 0xd0, 0x4, 0x12, 0xc6, 0xa7, 0x73, 0x8b, 0x5f, 0x3e, 0xea, 0xfc, 0x28, 0x49, 0x9d, 0xa4, 0x70, 0x11, 0xc5, 0xd3, 0x7, 0x66, 0xb2, 0x4a, 0x9e, 0xff, 0x2b, 0x3d, 0xe9, 0x88, 0x5c, 0xfa, 0x2e, 0x4f, 0x9b, 0x8d, 0x59, 0x38, 0xec, 0x14, 0xc0, 0xa1, 0x75, 0x63, 0xb7, 0xd6, 0x2, 0x3b, 0xef, 0x8e, 0x5a, 0x4c, 0x98, 0xf9, 0x2d, 0xd5, 0x1, 0x60, 0xb4, 0xa2, 0x76, 0x17, 0xc3}, + {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7, 0xd1, 0x4, 0x66, 0xb3, 0xa2, 0x77, 0x15, 0xc0, 0x37, 0xe2, 0x80, 0x55, 0x44, 0x91, 0xf3, 0x26, 0xbf, 0x6a, 0x8, 0xdd, 0xcc, 0x19, 0x7b, 0xae, 0x59, 0x8c, 0xee, 0x3b, 0x2a, 0xff, 0x9d, 0x48, 0x6e, 0xbb, 0xd9, 0xc, 0x1d, 0xc8, 0xaa, 0x7f, 0x88, 0x5d, 0x3f, 0xea, 0xfb, 0x2e, 0x4c, 0x99, 0x63, 0xb6, 0xd4, 0x1, 0x10, 0xc5, 0xa7, 0x72, 0x85, 0x50, 0x32, 0xe7, 0xf6, 0x23, 0x41, 0x94, 0xb2, 0x67, 0x5, 0xd0, 0xc1, 0x14, 0x76, 0xa3, 0x54, 0x81, 0xe3, 0x36, 0x27, 0xf2, 0x90, 0x45, 0xdc, 0x9, 0x6b, 0xbe, 0xaf, 0x7a, 0x18, 0xcd, 0x3a, 0xef, 0x8d, 0x58, 0x49, 0x9c, 0xfe, 0x2b, 0xd, 0xd8, 0xba, 0x6f, 0x7e, 0xab, 0xc9, 0x1c, 0xeb, 0x3e, 0x5c, 0x89, 0x98, 0x4d, 0x2f, 0xfa, 0xc6, 0x13, 0x71, 0xa4, 0xb5, 0x60, 0x2, 0xd7, 0x20, 0xf5, 0x97, 0x42, 0x53, 0x86, 0xe4, 0x31, 0x17, 0xc2, 0xa0, 0x75, 0x64, 0xb1, 0xd3, 0x6, 0xf1, 0x24, 0x46, 0x93, 0x82, 0x57, 0x35, 0xe0, 0x79, 0xac, 0xce, 0x1b, 0xa, 0xdf, 0xbd, 0x68, 0x9f, 0x4a, 0x28, 0xfd, 0xec, 0x39, 0x5b, 0x8e, 0xa8, 0x7d, 0x1f, 0xca, 0xdb, 0xe, 0x6c, 0xb9, 0x4e, 0x9b, 0xf9, 0x2c, 0x3d, 0xe8, 0x8a, 0x5f, 0xa5, 0x70, 0x12, 0xc7, 0xd6, 0x3, 0x61, 0xb4, 0x43, 0x96, 0xf4, 0x21, 0x30, 0xe5, 0x87, 0x52, 0x74, 0xa1, 0xc3, 0x16, 0x7, 0xd2, 0xb0, 0x65, 0x92, 0x47, 0x25, 0xf0, 0xe1, 0x34, 0x56, 0x83, 0x1a, 0xcf, 0xad, 0x78, 0x69, 0xbc, 0xde, 0xb, 0xfc, 0x29, 0x4b, 0x9e, 0x8f, 0x5a, 0x38, 0xed, 0xcb, 0x1e, 0x7c, 0xa9, 0xb8, 0x6d, 0xf, 0xda, 0x2d, 0xf8, 0x9a, 0x4f, 0x5e, 0x8b, 0xe9, 0x3c}, + {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6, 0xe1, 0x37, 0x50, 0x86, 0x9e, 0x48, 0x2f, 0xf9, 0x1f, 0xc9, 0xae, 0x78, 0x60, 0xb6, 0xd1, 0x7, 0xdf, 0x9, 0x6e, 0xb8, 0xa0, 0x76, 0x11, 0xc7, 0x21, 0xf7, 0x90, 0x46, 0x5e, 0x88, 0xef, 0x39, 0x3e, 0xe8, 0x8f, 0x59, 0x41, 0x97, 0xf0, 0x26, 0xc0, 0x16, 0x71, 0xa7, 0xbf, 0x69, 0xe, 0xd8, 0xa3, 0x75, 0x12, 0xc4, 0xdc, 0xa, 0x6d, 0xbb, 0x5d, 0x8b, 0xec, 0x3a, 0x22, 0xf4, 0x93, 0x45, 0x42, 0x94, 0xf3, 0x25, 0x3d, 0xeb, 0x8c, 0x5a, 0xbc, 0x6a, 0xd, 0xdb, 0xc3, 0x15, 0x72, 0xa4, 0x7c, 0xaa, 0xcd, 0x1b, 0x3, 0xd5, 0xb2, 0x64, 0x82, 0x54, 0x33, 0xe5, 0xfd, 0x2b, 0x4c, 0x9a, 0x9d, 0x4b, 0x2c, 0xfa, 0xe2, 0x34, 0x53, 0x85, 0x63, 0xb5, 0xd2, 0x4, 0x1c, 0xca, 0xad, 0x7b, 0x5b, 0x8d, 0xea, 0x3c, 0x24, 0xf2, 0x95, 0x43, 0xa5, 0x73, 0x14, 0xc2, 0xda, 0xc, 0x6b, 0xbd, 0xba, 0x6c, 0xb, 0xdd, 0xc5, 0x13, 0x74, 0xa2, 0x44, 0x92, 0xf5, 0x23, 0x3b, 0xed, 0x8a, 0x5c, 0x84, 0x52, 0x35, 0xe3, 0xfb, 0x2d, 0x4a, 0x9c, 0x7a, 0xac, 0xcb, 0x1d, 0x5, 0xd3, 0xb4, 0x62, 0x65, 0xb3, 0xd4, 0x2, 0x1a, 0xcc, 0xab, 0x7d, 0x9b, 0x4d, 0x2a, 0xfc, 0xe4, 0x32, 0x55, 0x83, 0xf8, 0x2e, 0x49, 0x9f, 0x87, 0x51, 0x36, 0xe0, 0x6, 0xd0, 0xb7, 0x61, 0x79, 
0xaf, 0xc8, 0x1e, 0x19, 0xcf, 0xa8, 0x7e, 0x66, 0xb0, 0xd7, 0x1, 0xe7, 0x31, 0x56, 0x80, 0x98, 0x4e, 0x29, 0xff, 0x27, 0xf1, 0x96, 0x40, 0x58, 0x8e, 0xe9, 0x3f, 0xd9, 0xf, 0x68, 0xbe, 0xa6, 0x70, 0x17, 0xc1, 0xc6, 0x10, 0x77, 0xa1, 0xb9, 0x6f, 0x8, 0xde, 0x38, 0xee, 0x89, 0x5f, 0x47, 0x91, 0xf6, 0x20}, + {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9, 0xf1, 0x26, 0x42, 0x95, 0x8a, 0x5d, 0x39, 0xee, 0x7, 0xd0, 0xb4, 0x63, 0x7c, 0xab, 0xcf, 0x18, 0xff, 0x28, 0x4c, 0x9b, 0x84, 0x53, 0x37, 0xe0, 0x9, 0xde, 0xba, 0x6d, 0x72, 0xa5, 0xc1, 0x16, 0xe, 0xd9, 0xbd, 0x6a, 0x75, 0xa2, 0xc6, 0x11, 0xf8, 0x2f, 0x4b, 0x9c, 0x83, 0x54, 0x30, 0xe7, 0xe3, 0x34, 0x50, 0x87, 0x98, 0x4f, 0x2b, 0xfc, 0x15, 0xc2, 0xa6, 0x71, 0x6e, 0xb9, 0xdd, 0xa, 0x12, 0xc5, 0xa1, 0x76, 0x69, 0xbe, 0xda, 0xd, 0xe4, 0x33, 0x57, 0x80, 0x9f, 0x48, 0x2c, 0xfb, 0x1c, 0xcb, 0xaf, 0x78, 0x67, 0xb0, 0xd4, 0x3, 0xea, 0x3d, 0x59, 0x8e, 0x91, 0x46, 0x22, 0xf5, 0xed, 0x3a, 0x5e, 0x89, 0x96, 0x41, 0x25, 0xf2, 0x1b, 0xcc, 0xa8, 0x7f, 0x60, 0xb7, 0xd3, 0x4, 0xdb, 0xc, 0x68, 0xbf, 0xa0, 0x77, 0x13, 0xc4, 0x2d, 0xfa, 0x9e, 0x49, 0x56, 0x81, 0xe5, 0x32, 0x2a, 0xfd, 0x99, 0x4e, 0x51, 0x86, 0xe2, 0x35, 0xdc, 0xb, 0x6f, 0xb8, 0xa7, 0x70, 0x14, 0xc3, 0x24, 0xf3, 0x97, 0x40, 0x5f, 0x88, 0xec, 0x3b, 0xd2, 0x5, 0x61, 0xb6, 0xa9, 0x7e, 0x1a, 0xcd, 0xd5, 0x2, 0x66, 0xb1, 0xae, 0x79, 0x1d, 0xca, 0x23, 0xf4, 0x90, 0x47, 0x58, 0x8f, 0xeb, 0x3c, 0x38, 0xef, 0x8b, 0x5c, 0x43, 0x94, 0xf0, 0x27, 0xce, 0x19, 0x7d, 0xaa, 0xb5, 0x62, 0x6, 0xd1, 0xc9, 0x1e, 0x7a, 0xad, 0xb2, 0x65, 0x1, 0xd6, 0x3f, 0xe8, 0x8c, 0x5b, 0x44, 0x93, 0xf7, 0x20, 0xc7, 0x10, 0x74, 0xa3, 0xbc, 0x6b, 0xf, 0xd8, 0x31, 0xe6, 0x82, 0x55, 0x4a, 0x9d, 0xf9, 0x2e, 0x36, 0xe1, 0x85, 0x52, 0x4d, 0x9a, 0xfe, 0x29, 0xc0, 0x17, 0x73, 0xa4, 0xbb, 0x6c, 0x8, 0xdf}, + {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc, 0x1, 0xd9, 0xac, 0x74, 0x46, 0x9e, 0xeb, 0x33, 0x8f, 0x57, 0x22, 0xfa, 0xc8, 0x10, 0x65, 0xbd, 0x2, 0xda, 0xaf, 0x77, 0x45, 0x9d, 0xe8, 0x30, 0x8c, 0x54, 0x21, 0xf9, 0xcb, 0x13, 0x66, 0xbe, 0x3, 0xdb, 0xae, 0x76, 0x44, 0x9c, 0xe9, 0x31, 0x8d, 0x55, 0x20, 0xf8, 0xca, 0x12, 0x67, 0xbf, 0x4, 0xdc, 0xa9, 0x71, 0x43, 0x9b, 0xee, 0x36, 0x8a, 0x52, 0x27, 0xff, 0xcd, 0x15, 0x60, 0xb8, 0x5, 0xdd, 0xa8, 0x70, 0x42, 0x9a, 0xef, 0x37, 0x8b, 0x53, 0x26, 0xfe, 0xcc, 0x14, 0x61, 0xb9, 0x6, 0xde, 0xab, 0x73, 0x41, 0x99, 0xec, 0x34, 0x88, 0x50, 0x25, 0xfd, 0xcf, 0x17, 0x62, 0xba, 0x7, 0xdf, 0xaa, 0x72, 0x40, 0x98, 0xed, 0x35, 0x89, 0x51, 0x24, 0xfc, 0xce, 0x16, 0x63, 0xbb, 0x8, 0xd0, 0xa5, 0x7d, 0x4f, 0x97, 0xe2, 0x3a, 0x86, 0x5e, 0x2b, 0xf3, 0xc1, 0x19, 0x6c, 0xb4, 0x9, 0xd1, 0xa4, 0x7c, 0x4e, 0x96, 0xe3, 0x3b, 0x87, 0x5f, 0x2a, 0xf2, 0xc0, 0x18, 0x6d, 0xb5, 0xa, 0xd2, 0xa7, 0x7f, 0x4d, 0x95, 0xe0, 0x38, 0x84, 0x5c, 0x29, 0xf1, 0xc3, 0x1b, 0x6e, 0xb6, 0xb, 0xd3, 0xa6, 0x7e, 0x4c, 0x94, 0xe1, 0x39, 0x85, 0x5d, 0x28, 0xf0, 0xc2, 0x1a, 0x6f, 0xb7, 0xc, 0xd4, 0xa1, 0x79, 0x4b, 0x93, 0xe6, 0x3e, 0x82, 0x5a, 0x2f, 0xf7, 0xc5, 0x1d, 0x68, 0xb0, 0xd, 0xd5, 0xa0, 0x78, 0x4a, 0x92, 0xe7, 0x3f, 0x83, 0x5b, 0x2e, 0xf6, 0xc4, 0x1c, 0x69, 0xb1, 0xe, 0xd6, 0xa3, 0x7b, 0x49, 0x91, 0xe4, 0x3c, 0x80, 0x58, 0x2d, 0xf5, 0xc7, 0x1f, 0x6a, 0xb2, 0xf, 0xd7, 0xa2, 0x7a, 0x48, 0x90, 0xe5, 0x3d, 0x81, 0x59, 0x2c, 0xf4, 0xc6, 0x1e, 0x6b, 0xb3}, + {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3, 0x11, 0xc8, 0xbe, 0x67, 0x52, 0x8b, 0xfd, 0x24, 0x97, 0x4e, 0x38, 0xe1, 0xd4, 0xd, 0x7b, 0xa2, 0x22, 
0xfb, 0x8d, 0x54, 0x61, 0xb8, 0xce, 0x17, 0xa4, 0x7d, 0xb, 0xd2, 0xe7, 0x3e, 0x48, 0x91, 0x33, 0xea, 0x9c, 0x45, 0x70, 0xa9, 0xdf, 0x6, 0xb5, 0x6c, 0x1a, 0xc3, 0xf6, 0x2f, 0x59, 0x80, 0x44, 0x9d, 0xeb, 0x32, 0x7, 0xde, 0xa8, 0x71, 0xc2, 0x1b, 0x6d, 0xb4, 0x81, 0x58, 0x2e, 0xf7, 0x55, 0x8c, 0xfa, 0x23, 0x16, 0xcf, 0xb9, 0x60, 0xd3, 0xa, 0x7c, 0xa5, 0x90, 0x49, 0x3f, 0xe6, 0x66, 0xbf, 0xc9, 0x10, 0x25, 0xfc, 0x8a, 0x53, 0xe0, 0x39, 0x4f, 0x96, 0xa3, 0x7a, 0xc, 0xd5, 0x77, 0xae, 0xd8, 0x1, 0x34, 0xed, 0x9b, 0x42, 0xf1, 0x28, 0x5e, 0x87, 0xb2, 0x6b, 0x1d, 0xc4, 0x88, 0x51, 0x27, 0xfe, 0xcb, 0x12, 0x64, 0xbd, 0xe, 0xd7, 0xa1, 0x78, 0x4d, 0x94, 0xe2, 0x3b, 0x99, 0x40, 0x36, 0xef, 0xda, 0x3, 0x75, 0xac, 0x1f, 0xc6, 0xb0, 0x69, 0x5c, 0x85, 0xf3, 0x2a, 0xaa, 0x73, 0x5, 0xdc, 0xe9, 0x30, 0x46, 0x9f, 0x2c, 0xf5, 0x83, 0x5a, 0x6f, 0xb6, 0xc0, 0x19, 0xbb, 0x62, 0x14, 0xcd, 0xf8, 0x21, 0x57, 0x8e, 0x3d, 0xe4, 0x92, 0x4b, 0x7e, 0xa7, 0xd1, 0x8, 0xcc, 0x15, 0x63, 0xba, 0x8f, 0x56, 0x20, 0xf9, 0x4a, 0x93, 0xe5, 0x3c, 0x9, 0xd0, 0xa6, 0x7f, 0xdd, 0x4, 0x72, 0xab, 0x9e, 0x47, 0x31, 0xe8, 0x5b, 0x82, 0xf4, 0x2d, 0x18, 0xc1, 0xb7, 0x6e, 0xee, 0x37, 0x41, 0x98, 0xad, 0x74, 0x2, 0xdb, 0x68, 0xb1, 0xc7, 0x1e, 0x2b, 0xf2, 0x84, 0x5d, 0xff, 0x26, 0x50, 0x89, 0xbc, 0x65, 0x13, 0xca, 0x79, 0xa0, 0xd6, 0xf, 0x3a, 0xe3, 0x95, 0x4c}, + {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2, 0x21, 0xfb, 0x88, 0x52, 0x6e, 0xb4, 0xc7, 0x1d, 0xbf, 0x65, 0x16, 0xcc, 0xf0, 0x2a, 0x59, 0x83, 0x42, 0x98, 0xeb, 0x31, 0xd, 0xd7, 0xa4, 0x7e, 0xdc, 0x6, 0x75, 0xaf, 0x93, 0x49, 0x3a, 0xe0, 0x63, 0xb9, 0xca, 0x10, 0x2c, 0xf6, 0x85, 0x5f, 0xfd, 0x27, 0x54, 0x8e, 0xb2, 0x68, 0x1b, 0xc1, 0x84, 0x5e, 0x2d, 0xf7, 0xcb, 0x11, 0x62, 0xb8, 0x1a, 0xc0, 0xb3, 0x69, 0x55, 0x8f, 0xfc, 0x26, 0xa5, 0x7f, 0xc, 0xd6, 0xea, 0x30, 0x43, 0x99, 0x3b, 0xe1, 0x92, 0x48, 0x74, 0xae, 0xdd, 0x7, 0xc6, 0x1c, 0x6f, 0xb5, 0x89, 0x53, 0x20, 0xfa, 0x58, 0x82, 0xf1, 0x2b, 0x17, 0xcd, 0xbe, 0x64, 0xe7, 0x3d, 0x4e, 0x94, 0xa8, 0x72, 0x1, 0xdb, 0x79, 0xa3, 0xd0, 0xa, 0x36, 0xec, 0x9f, 0x45, 0x15, 0xcf, 0xbc, 0x66, 0x5a, 0x80, 0xf3, 0x29, 0x8b, 0x51, 0x22, 0xf8, 0xc4, 0x1e, 0x6d, 0xb7, 0x34, 0xee, 0x9d, 0x47, 0x7b, 0xa1, 0xd2, 0x8, 0xaa, 0x70, 0x3, 0xd9, 0xe5, 0x3f, 0x4c, 0x96, 0x57, 0x8d, 0xfe, 0x24, 0x18, 0xc2, 0xb1, 0x6b, 0xc9, 0x13, 0x60, 0xba, 0x86, 0x5c, 0x2f, 0xf5, 0x76, 0xac, 0xdf, 0x5, 0x39, 0xe3, 0x90, 0x4a, 0xe8, 0x32, 0x41, 0x9b, 0xa7, 0x7d, 0xe, 0xd4, 0x91, 0x4b, 0x38, 0xe2, 0xde, 0x4, 0x77, 0xad, 0xf, 0xd5, 0xa6, 0x7c, 0x40, 0x9a, 0xe9, 0x33, 0xb0, 0x6a, 0x19, 0xc3, 0xff, 0x25, 0x56, 0x8c, 0x2e, 0xf4, 0x87, 0x5d, 0x61, 0xbb, 0xc8, 0x12, 0xd3, 0x9, 0x7a, 0xa0, 0x9c, 0x46, 0x35, 0xef, 0x4d, 0x97, 0xe4, 0x3e, 0x2, 0xd8, 0xab, 0x71, 0xf2, 0x28, 0x5b, 0x81, 0xbd, 0x67, 0x14, 0xce, 0x6c, 0xb6, 0xc5, 0x1f, 0x23, 0xf9, 0x8a, 0x50}, + {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad, 0x31, 0xea, 0x9a, 0x41, 0x7a, 0xa1, 0xd1, 0xa, 0xa7, 0x7c, 0xc, 0xd7, 0xec, 0x37, 0x47, 0x9c, 0x62, 0xb9, 0xc9, 0x12, 0x29, 0xf2, 0x82, 0x59, 0xf4, 0x2f, 0x5f, 0x84, 0xbf, 0x64, 0x14, 0xcf, 0x53, 0x88, 0xf8, 0x23, 0x18, 0xc3, 0xb3, 0x68, 0xc5, 0x1e, 0x6e, 0xb5, 0x8e, 0x55, 0x25, 0xfe, 0xc4, 0x1f, 0x6f, 0xb4, 0x8f, 0x54, 0x24, 0xff, 0x52, 0x89, 0xf9, 0x22, 0x19, 0xc2, 0xb2, 0x69, 0xf5, 0x2e, 0x5e, 0x85, 0xbe, 0x65, 0x15, 0xce, 0x63, 0xb8, 0xc8, 0x13, 0x28, 0xf3, 0x83, 0x58, 0xa6, 0x7d, 0xd, 0xd6, 0xed, 0x36, 0x46, 0x9d, 0x30, 0xeb, 0x9b, 0x40, 0x7b, 0xa0, 0xd0, 0xb, 0x97, 0x4c, 0x3c, 0xe7, 0xdc, 0x7, 
0x77, 0xac, 0x1, 0xda, 0xaa, 0x71, 0x4a, 0x91, 0xe1, 0x3a, 0x95, 0x4e, 0x3e, 0xe5, 0xde, 0x5, 0x75, 0xae, 0x3, 0xd8, 0xa8, 0x73, 0x48, 0x93, 0xe3, 0x38, 0xa4, 0x7f, 0xf, 0xd4, 0xef, 0x34, 0x44, 0x9f, 0x32, 0xe9, 0x99, 0x42, 0x79, 0xa2, 0xd2, 0x9, 0xf7, 0x2c, 0x5c, 0x87, 0xbc, 0x67, 0x17, 0xcc, 0x61, 0xba, 0xca, 0x11, 0x2a, 0xf1, 0x81, 0x5a, 0xc6, 0x1d, 0x6d, 0xb6, 0x8d, 0x56, 0x26, 0xfd, 0x50, 0x8b, 0xfb, 0x20, 0x1b, 0xc0, 0xb0, 0x6b, 0x51, 0x8a, 0xfa, 0x21, 0x1a, 0xc1, 0xb1, 0x6a, 0xc7, 0x1c, 0x6c, 0xb7, 0x8c, 0x57, 0x27, 0xfc, 0x60, 0xbb, 0xcb, 0x10, 0x2b, 0xf0, 0x80, 0x5b, 0xf6, 0x2d, 0x5d, 0x86, 0xbd, 0x66, 0x16, 0xcd, 0x33, 0xe8, 0x98, 0x43, 0x78, 0xa3, 0xd3, 0x8, 0xa5, 0x7e, 0xe, 0xd5, 0xee, 0x35, 0x45, 0x9e, 0x2, 0xd9, 0xa9, 0x72, 0x49, 0x92, 0xe2, 0x39, 0x94, 0x4f, 0x3f, 0xe4, 0xdf, 0x4, 0x74, 0xaf}, + {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80, 0x41, 0x9d, 0xe4, 0x38, 0x16, 0xca, 0xb3, 0x6f, 0xef, 0x33, 0x4a, 0x96, 0xb8, 0x64, 0x1d, 0xc1, 0x82, 0x5e, 0x27, 0xfb, 0xd5, 0x9, 0x70, 0xac, 0x2c, 0xf0, 0x89, 0x55, 0x7b, 0xa7, 0xde, 0x2, 0xc3, 0x1f, 0x66, 0xba, 0x94, 0x48, 0x31, 0xed, 0x6d, 0xb1, 0xc8, 0x14, 0x3a, 0xe6, 0x9f, 0x43, 0x19, 0xc5, 0xbc, 0x60, 0x4e, 0x92, 0xeb, 0x37, 0xb7, 0x6b, 0x12, 0xce, 0xe0, 0x3c, 0x45, 0x99, 0x58, 0x84, 0xfd, 0x21, 0xf, 0xd3, 0xaa, 0x76, 0xf6, 0x2a, 0x53, 0x8f, 0xa1, 0x7d, 0x4, 0xd8, 0x9b, 0x47, 0x3e, 0xe2, 0xcc, 0x10, 0x69, 0xb5, 0x35, 0xe9, 0x90, 0x4c, 0x62, 0xbe, 0xc7, 0x1b, 0xda, 0x6, 0x7f, 0xa3, 0x8d, 0x51, 0x28, 0xf4, 0x74, 0xa8, 0xd1, 0xd, 0x23, 0xff, 0x86, 0x5a, 0x32, 0xee, 0x97, 0x4b, 0x65, 0xb9, 0xc0, 0x1c, 0x9c, 0x40, 0x39, 0xe5, 0xcb, 0x17, 0x6e, 0xb2, 0x73, 0xaf, 0xd6, 0xa, 0x24, 0xf8, 0x81, 0x5d, 0xdd, 0x1, 0x78, 0xa4, 0x8a, 0x56, 0x2f, 0xf3, 0xb0, 0x6c, 0x15, 0xc9, 0xe7, 0x3b, 0x42, 0x9e, 0x1e, 0xc2, 0xbb, 0x67, 0x49, 0x95, 0xec, 0x30, 0xf1, 0x2d, 0x54, 0x88, 0xa6, 0x7a, 0x3, 0xdf, 0x5f, 0x83, 0xfa, 0x26, 0x8, 0xd4, 0xad, 0x71, 0x2b, 0xf7, 0x8e, 0x52, 0x7c, 0xa0, 0xd9, 0x5, 0x85, 0x59, 0x20, 0xfc, 0xd2, 0xe, 0x77, 0xab, 0x6a, 0xb6, 0xcf, 0x13, 0x3d, 0xe1, 0x98, 0x44, 0xc4, 0x18, 0x61, 0xbd, 0x93, 0x4f, 0x36, 0xea, 0xa9, 0x75, 0xc, 0xd0, 0xfe, 0x22, 0x5b, 0x87, 0x7, 0xdb, 0xa2, 0x7e, 0x50, 0x8c, 0xf5, 0x29, 0xe8, 0x34, 0x4d, 0x91, 0xbf, 0x63, 0x1a, 0xc6, 0x46, 0x9a, 0xe3, 0x3f, 0x11, 0xcd, 0xb4, 0x68}, + {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f, 0x51, 0x8c, 0xf6, 0x2b, 0x2, 0xdf, 0xa5, 0x78, 0xf7, 0x2a, 0x50, 0x8d, 0xa4, 0x79, 0x3, 0xde, 0xa2, 0x7f, 0x5, 0xd8, 0xf1, 0x2c, 0x56, 0x8b, 0x4, 0xd9, 0xa3, 0x7e, 0x57, 0x8a, 0xf0, 0x2d, 0xf3, 0x2e, 0x54, 0x89, 0xa0, 0x7d, 0x7, 0xda, 0x55, 0x88, 0xf2, 0x2f, 0x6, 0xdb, 0xa1, 0x7c, 0x59, 0x84, 0xfe, 0x23, 0xa, 0xd7, 0xad, 0x70, 0xff, 0x22, 0x58, 0x85, 0xac, 0x71, 0xb, 0xd6, 0x8, 0xd5, 0xaf, 0x72, 0x5b, 0x86, 0xfc, 0x21, 0xae, 0x73, 0x9, 0xd4, 0xfd, 0x20, 0x5a, 0x87, 0xfb, 0x26, 0x5c, 0x81, 0xa8, 0x75, 0xf, 0xd2, 0x5d, 0x80, 0xfa, 0x27, 0xe, 0xd3, 0xa9, 0x74, 0xaa, 0x77, 0xd, 0xd0, 0xf9, 0x24, 0x5e, 0x83, 0xc, 0xd1, 0xab, 0x76, 0x5f, 0x82, 0xf8, 0x25, 0xb2, 0x6f, 0x15, 0xc8, 0xe1, 0x3c, 0x46, 0x9b, 0x14, 0xc9, 0xb3, 0x6e, 0x47, 0x9a, 0xe0, 0x3d, 0xe3, 0x3e, 0x44, 0x99, 0xb0, 0x6d, 0x17, 0xca, 0x45, 0x98, 0xe2, 0x3f, 0x16, 0xcb, 0xb1, 0x6c, 0x10, 0xcd, 0xb7, 0x6a, 0x43, 0x9e, 0xe4, 0x39, 0xb6, 0x6b, 0x11, 0xcc, 0xe5, 0x38, 0x42, 0x9f, 0x41, 0x9c, 0xe6, 0x3b, 0x12, 0xcf, 0xb5, 0x68, 0xe7, 0x3a, 0x40, 0x9d, 0xb4, 0x69, 0x13, 0xce, 0xeb, 0x36, 0x4c, 0x91, 0xb8, 0x65, 0x1f, 0xc2, 0x4d, 0x90, 0xea, 0x37, 
0x1e, 0xc3, 0xb9, 0x64, 0xba, 0x67, 0x1d, 0xc0, 0xe9, 0x34, 0x4e, 0x93, 0x1c, 0xc1, 0xbb, 0x66, 0x4f, 0x92, 0xe8, 0x35, 0x49, 0x94, 0xee, 0x33, 0x1a, 0xc7, 0xbd, 0x60, 0xef, 0x32, 0x48, 0x95, 0xbc, 0x61, 0x1b, 0xc6, 0x18, 0xc5, 0xbf, 0x62, 0x4b, 0x96, 0xec, 0x31, 0xbe, 0x63, 0x19, 0xc4, 0xed, 0x30, 0x4a, 0x97}, + {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e, 0x61, 0xbf, 0xc0, 0x1e, 0x3e, 0xe0, 0x9f, 0x41, 0xdf, 0x1, 0x7e, 0xa0, 0x80, 0x5e, 0x21, 0xff, 0xc2, 0x1c, 0x63, 0xbd, 0x9d, 0x43, 0x3c, 0xe2, 0x7c, 0xa2, 0xdd, 0x3, 0x23, 0xfd, 0x82, 0x5c, 0xa3, 0x7d, 0x2, 0xdc, 0xfc, 0x22, 0x5d, 0x83, 0x1d, 0xc3, 0xbc, 0x62, 0x42, 0x9c, 0xe3, 0x3d, 0x99, 0x47, 0x38, 0xe6, 0xc6, 0x18, 0x67, 0xb9, 0x27, 0xf9, 0x86, 0x58, 0x78, 0xa6, 0xd9, 0x7, 0xf8, 0x26, 0x59, 0x87, 0xa7, 0x79, 0x6, 0xd8, 0x46, 0x98, 0xe7, 0x39, 0x19, 0xc7, 0xb8, 0x66, 0x5b, 0x85, 0xfa, 0x24, 0x4, 0xda, 0xa5, 0x7b, 0xe5, 0x3b, 0x44, 0x9a, 0xba, 0x64, 0x1b, 0xc5, 0x3a, 0xe4, 0x9b, 0x45, 0x65, 0xbb, 0xc4, 0x1a, 0x84, 0x5a, 0x25, 0xfb, 0xdb, 0x5, 0x7a, 0xa4, 0x2f, 0xf1, 0x8e, 0x50, 0x70, 0xae, 0xd1, 0xf, 0x91, 0x4f, 0x30, 0xee, 0xce, 0x10, 0x6f, 0xb1, 0x4e, 0x90, 0xef, 0x31, 0x11, 0xcf, 0xb0, 0x6e, 0xf0, 0x2e, 0x51, 0x8f, 0xaf, 0x71, 0xe, 0xd0, 0xed, 0x33, 0x4c, 0x92, 0xb2, 0x6c, 0x13, 0xcd, 0x53, 0x8d, 0xf2, 0x2c, 0xc, 0xd2, 0xad, 0x73, 0x8c, 0x52, 0x2d, 0xf3, 0xd3, 0xd, 0x72, 0xac, 0x32, 0xec, 0x93, 0x4d, 0x6d, 0xb3, 0xcc, 0x12, 0xb6, 0x68, 0x17, 0xc9, 0xe9, 0x37, 0x48, 0x96, 0x8, 0xd6, 0xa9, 0x77, 0x57, 0x89, 0xf6, 0x28, 0xd7, 0x9, 0x76, 0xa8, 0x88, 0x56, 0x29, 0xf7, 0x69, 0xb7, 0xc8, 0x16, 0x36, 0xe8, 0x97, 0x49, 0x74, 0xaa, 0xd5, 0xb, 0x2b, 0xf5, 0x8a, 0x54, 0xca, 0x14, 0x6b, 0xb5, 0x95, 0x4b, 0x34, 0xea, 0x15, 0xcb, 0xb4, 0x6a, 0x4a, 0x94, 0xeb, 0x35, 0xab, 0x75, 0xa, 0xd4, 0xf4, 0x2a, 0x55, 0x8b}, + {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91, 0x71, 0xae, 0xd2, 0xd, 0x2a, 0xf5, 0x89, 0x56, 0xc7, 0x18, 0x64, 0xbb, 0x9c, 0x43, 0x3f, 0xe0, 0xe2, 0x3d, 0x41, 0x9e, 0xb9, 0x66, 0x1a, 0xc5, 0x54, 0x8b, 0xf7, 0x28, 0xf, 0xd0, 0xac, 0x73, 0x93, 0x4c, 0x30, 0xef, 0xc8, 0x17, 0x6b, 0xb4, 0x25, 0xfa, 0x86, 0x59, 0x7e, 0xa1, 0xdd, 0x2, 0xd9, 0x6, 0x7a, 0xa5, 0x82, 0x5d, 0x21, 0xfe, 0x6f, 0xb0, 0xcc, 0x13, 0x34, 0xeb, 0x97, 0x48, 0xa8, 0x77, 0xb, 0xd4, 0xf3, 0x2c, 0x50, 0x8f, 0x1e, 0xc1, 0xbd, 0x62, 0x45, 0x9a, 0xe6, 0x39, 0x3b, 0xe4, 0x98, 0x47, 0x60, 0xbf, 0xc3, 0x1c, 0x8d, 0x52, 0x2e, 0xf1, 0xd6, 0x9, 0x75, 0xaa, 0x4a, 0x95, 0xe9, 0x36, 0x11, 0xce, 0xb2, 0x6d, 0xfc, 0x23, 0x5f, 0x80, 0xa7, 0x78, 0x4, 0xdb, 0xaf, 0x70, 0xc, 0xd3, 0xf4, 0x2b, 0x57, 0x88, 0x19, 0xc6, 0xba, 0x65, 0x42, 0x9d, 0xe1, 0x3e, 0xde, 0x1, 0x7d, 0xa2, 0x85, 0x5a, 0x26, 0xf9, 0x68, 0xb7, 0xcb, 0x14, 0x33, 0xec, 0x90, 0x4f, 0x4d, 0x92, 0xee, 0x31, 0x16, 0xc9, 0xb5, 0x6a, 0xfb, 0x24, 0x58, 0x87, 0xa0, 0x7f, 0x3, 0xdc, 0x3c, 0xe3, 0x9f, 0x40, 0x67, 0xb8, 0xc4, 0x1b, 0x8a, 0x55, 0x29, 0xf6, 0xd1, 0xe, 0x72, 0xad, 0x76, 0xa9, 0xd5, 0xa, 0x2d, 0xf2, 0x8e, 0x51, 0xc0, 0x1f, 0x63, 0xbc, 0x9b, 0x44, 0x38, 0xe7, 0x7, 0xd8, 0xa4, 0x7b, 0x5c, 0x83, 0xff, 0x20, 0xb1, 0x6e, 0x12, 0xcd, 0xea, 0x35, 0x49, 0x96, 0x94, 0x4b, 0x37, 0xe8, 0xcf, 0x10, 0x6c, 0xb3, 0x22, 0xfd, 0x81, 0x5e, 0x79, 0xa6, 0xda, 0x5, 0xe5, 0x3a, 0x46, 0x99, 0xbe, 0x61, 0x1d, 0xc2, 0x53, 0x8c, 0xf0, 0x2f, 0x8, 0xd7, 0xab, 0x74}, + {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9, 0xa6, 0x46, 0x7b, 0x9b, 0x1, 0xe1, 0xdc, 0x3c, 0xf5, 0x15, 0x28, 0xc8, 0x52, 0xb2, 0x8f, 0x6f, 
0x51, 0xb1, 0x8c, 0x6c, 0xf6, 0x16, 0x2b, 0xcb, 0x2, 0xe2, 0xdf, 0x3f, 0xa5, 0x45, 0x78, 0x98, 0xf7, 0x17, 0x2a, 0xca, 0x50, 0xb0, 0x8d, 0x6d, 0xa4, 0x44, 0x79, 0x99, 0x3, 0xe3, 0xde, 0x3e, 0xa2, 0x42, 0x7f, 0x9f, 0x5, 0xe5, 0xd8, 0x38, 0xf1, 0x11, 0x2c, 0xcc, 0x56, 0xb6, 0x8b, 0x6b, 0x4, 0xe4, 0xd9, 0x39, 0xa3, 0x43, 0x7e, 0x9e, 0x57, 0xb7, 0x8a, 0x6a, 0xf0, 0x10, 0x2d, 0xcd, 0xf3, 0x13, 0x2e, 0xce, 0x54, 0xb4, 0x89, 0x69, 0xa0, 0x40, 0x7d, 0x9d, 0x7, 0xe7, 0xda, 0x3a, 0x55, 0xb5, 0x88, 0x68, 0xf2, 0x12, 0x2f, 0xcf, 0x6, 0xe6, 0xdb, 0x3b, 0xa1, 0x41, 0x7c, 0x9c, 0x59, 0xb9, 0x84, 0x64, 0xfe, 0x1e, 0x23, 0xc3, 0xa, 0xea, 0xd7, 0x37, 0xad, 0x4d, 0x70, 0x90, 0xff, 0x1f, 0x22, 0xc2, 0x58, 0xb8, 0x85, 0x65, 0xac, 0x4c, 0x71, 0x91, 0xb, 0xeb, 0xd6, 0x36, 0x8, 0xe8, 0xd5, 0x35, 0xaf, 0x4f, 0x72, 0x92, 0x5b, 0xbb, 0x86, 0x66, 0xfc, 0x1c, 0x21, 0xc1, 0xae, 0x4e, 0x73, 0x93, 0x9, 0xe9, 0xd4, 0x34, 0xfd, 0x1d, 0x20, 0xc0, 0x5a, 0xba, 0x87, 0x67, 0xfb, 0x1b, 0x26, 0xc6, 0x5c, 0xbc, 0x81, 0x61, 0xa8, 0x48, 0x75, 0x95, 0xf, 0xef, 0xd2, 0x32, 0x5d, 0xbd, 0x80, 0x60, 0xfa, 0x1a, 0x27, 0xc7, 0xe, 0xee, 0xd3, 0x33, 0xa9, 0x49, 0x74, 0x94, 0xaa, 0x4a, 0x77, 0x97, 0xd, 0xed, 0xd0, 0x30, 0xf9, 0x19, 0x24, 0xc4, 0x5e, 0xbe, 0x83, 0x63, 0xc, 0xec, 0xd1, 0x31, 0xab, 0x4b, 0x76, 0x96, 0x5f, 0xbf, 0x82, 0x62, 0xf8, 0x18, 0x25, 0xc5}, + {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6, 0xb6, 0x57, 0x69, 0x88, 0x15, 0xf4, 0xca, 0x2b, 0xed, 0xc, 0x32, 0xd3, 0x4e, 0xaf, 0x91, 0x70, 0x71, 0x90, 0xae, 0x4f, 0xd2, 0x33, 0xd, 0xec, 0x2a, 0xcb, 0xf5, 0x14, 0x89, 0x68, 0x56, 0xb7, 0xc7, 0x26, 0x18, 0xf9, 0x64, 0x85, 0xbb, 0x5a, 0x9c, 0x7d, 0x43, 0xa2, 0x3f, 0xde, 0xe0, 0x1, 0xe2, 0x3, 0x3d, 0xdc, 0x41, 0xa0, 0x9e, 0x7f, 0xb9, 0x58, 0x66, 0x87, 0x1a, 0xfb, 0xc5, 0x24, 0x54, 0xb5, 0x8b, 0x6a, 0xf7, 0x16, 0x28, 0xc9, 0xf, 0xee, 0xd0, 0x31, 0xac, 0x4d, 0x73, 0x92, 0x93, 0x72, 0x4c, 0xad, 0x30, 0xd1, 0xef, 0xe, 0xc8, 0x29, 0x17, 0xf6, 0x6b, 0x8a, 0xb4, 0x55, 0x25, 0xc4, 0xfa, 0x1b, 0x86, 0x67, 0x59, 0xb8, 0x7e, 0x9f, 0xa1, 0x40, 0xdd, 0x3c, 0x2, 0xe3, 0xd9, 0x38, 0x6, 0xe7, 0x7a, 0x9b, 0xa5, 0x44, 0x82, 0x63, 0x5d, 0xbc, 0x21, 0xc0, 0xfe, 0x1f, 0x6f, 0x8e, 0xb0, 0x51, 0xcc, 0x2d, 0x13, 0xf2, 0x34, 0xd5, 0xeb, 0xa, 0x97, 0x76, 0x48, 0xa9, 0xa8, 0x49, 0x77, 0x96, 0xb, 0xea, 0xd4, 0x35, 0xf3, 0x12, 0x2c, 0xcd, 0x50, 0xb1, 0x8f, 0x6e, 0x1e, 0xff, 0xc1, 0x20, 0xbd, 0x5c, 0x62, 0x83, 0x45, 0xa4, 0x9a, 0x7b, 0xe6, 0x7, 0x39, 0xd8, 0x3b, 0xda, 0xe4, 0x5, 0x98, 0x79, 0x47, 0xa6, 0x60, 0x81, 0xbf, 0x5e, 0xc3, 0x22, 0x1c, 0xfd, 0x8d, 0x6c, 0x52, 0xb3, 0x2e, 0xcf, 0xf1, 0x10, 0xd6, 0x37, 0x9, 0xe8, 0x75, 0x94, 0xaa, 0x4b, 0x4a, 0xab, 0x95, 0x74, 0xe9, 0x8, 0x36, 0xd7, 0x11, 0xf0, 0xce, 0x2f, 0xb2, 0x53, 0x6d, 0x8c, 0xfc, 0x1d, 0x23, 0xc2, 0x5f, 0xbe, 0x80, 0x61, 0xa7, 0x46, 0x78, 0x99, 0x4, 0xe5, 0xdb, 0x3a}, + {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7, 0x86, 0x64, 0x5f, 0xbd, 0x29, 0xcb, 0xf0, 0x12, 0xc5, 0x27, 0x1c, 0xfe, 0x6a, 0x88, 0xb3, 0x51, 0x11, 0xf3, 0xc8, 0x2a, 0xbe, 0x5c, 0x67, 0x85, 0x52, 0xb0, 0x8b, 0x69, 0xfd, 0x1f, 0x24, 0xc6, 0x97, 0x75, 0x4e, 0xac, 0x38, 0xda, 0xe1, 0x3, 0xd4, 0x36, 0xd, 0xef, 0x7b, 0x99, 0xa2, 0x40, 0x22, 0xc0, 0xfb, 0x19, 0x8d, 0x6f, 0x54, 0xb6, 0x61, 0x83, 0xb8, 0x5a, 0xce, 0x2c, 0x17, 0xf5, 0xa4, 0x46, 0x7d, 0x9f, 0xb, 0xe9, 0xd2, 0x30, 0xe7, 0x5, 0x3e, 0xdc, 0x48, 0xaa, 0x91, 0x73, 0x33, 0xd1, 0xea, 0x8, 0x9c, 0x7e, 0x45, 0xa7, 0x70, 0x92, 0xa9, 0x4b, 0xdf, 0x3d, 0x6, 0xe4, 0xb5, 0x57, 0x6c, 0x8e, 0x1a, 
0xf8, 0xc3, 0x21, 0xf6, 0x14, 0x2f, 0xcd, 0x59, 0xbb, 0x80, 0x62, 0x44, 0xa6, 0x9d, 0x7f, 0xeb, 0x9, 0x32, 0xd0, 0x7, 0xe5, 0xde, 0x3c, 0xa8, 0x4a, 0x71, 0x93, 0xc2, 0x20, 0x1b, 0xf9, 0x6d, 0x8f, 0xb4, 0x56, 0x81, 0x63, 0x58, 0xba, 0x2e, 0xcc, 0xf7, 0x15, 0x55, 0xb7, 0x8c, 0x6e, 0xfa, 0x18, 0x23, 0xc1, 0x16, 0xf4, 0xcf, 0x2d, 0xb9, 0x5b, 0x60, 0x82, 0xd3, 0x31, 0xa, 0xe8, 0x7c, 0x9e, 0xa5, 0x47, 0x90, 0x72, 0x49, 0xab, 0x3f, 0xdd, 0xe6, 0x4, 0x66, 0x84, 0xbf, 0x5d, 0xc9, 0x2b, 0x10, 0xf2, 0x25, 0xc7, 0xfc, 0x1e, 0x8a, 0x68, 0x53, 0xb1, 0xe0, 0x2, 0x39, 0xdb, 0x4f, 0xad, 0x96, 0x74, 0xa3, 0x41, 0x7a, 0x98, 0xc, 0xee, 0xd5, 0x37, 0x77, 0x95, 0xae, 0x4c, 0xd8, 0x3a, 0x1, 0xe3, 0x34, 0xd6, 0xed, 0xf, 0x9b, 0x79, 0x42, 0xa0, 0xf1, 0x13, 0x28, 0xca, 0x5e, 0xbc, 0x87, 0x65, 0xb2, 0x50, 0x6b, 0x89, 0x1d, 0xff, 0xc4, 0x26}, + {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8, 0x96, 0x75, 0x4d, 0xae, 0x3d, 0xde, 0xe6, 0x5, 0xdd, 0x3e, 0x6, 0xe5, 0x76, 0x95, 0xad, 0x4e, 0x31, 0xd2, 0xea, 0x9, 0x9a, 0x79, 0x41, 0xa2, 0x7a, 0x99, 0xa1, 0x42, 0xd1, 0x32, 0xa, 0xe9, 0xa7, 0x44, 0x7c, 0x9f, 0xc, 0xef, 0xd7, 0x34, 0xec, 0xf, 0x37, 0xd4, 0x47, 0xa4, 0x9c, 0x7f, 0x62, 0x81, 0xb9, 0x5a, 0xc9, 0x2a, 0x12, 0xf1, 0x29, 0xca, 0xf2, 0x11, 0x82, 0x61, 0x59, 0xba, 0xf4, 0x17, 0x2f, 0xcc, 0x5f, 0xbc, 0x84, 0x67, 0xbf, 0x5c, 0x64, 0x87, 0x14, 0xf7, 0xcf, 0x2c, 0x53, 0xb0, 0x88, 0x6b, 0xf8, 0x1b, 0x23, 0xc0, 0x18, 0xfb, 0xc3, 0x20, 0xb3, 0x50, 0x68, 0x8b, 0xc5, 0x26, 0x1e, 0xfd, 0x6e, 0x8d, 0xb5, 0x56, 0x8e, 0x6d, 0x55, 0xb6, 0x25, 0xc6, 0xfe, 0x1d, 0xc4, 0x27, 0x1f, 0xfc, 0x6f, 0x8c, 0xb4, 0x57, 0x8f, 0x6c, 0x54, 0xb7, 0x24, 0xc7, 0xff, 0x1c, 0x52, 0xb1, 0x89, 0x6a, 0xf9, 0x1a, 0x22, 0xc1, 0x19, 0xfa, 0xc2, 0x21, 0xb2, 0x51, 0x69, 0x8a, 0xf5, 0x16, 0x2e, 0xcd, 0x5e, 0xbd, 0x85, 0x66, 0xbe, 0x5d, 0x65, 0x86, 0x15, 0xf6, 0xce, 0x2d, 0x63, 0x80, 0xb8, 0x5b, 0xc8, 0x2b, 0x13, 0xf0, 0x28, 0xcb, 0xf3, 0x10, 0x83, 0x60, 0x58, 0xbb, 0xa6, 0x45, 0x7d, 0x9e, 0xd, 0xee, 0xd6, 0x35, 0xed, 0xe, 0x36, 0xd5, 0x46, 0xa5, 0x9d, 0x7e, 0x30, 0xd3, 0xeb, 0x8, 0x9b, 0x78, 0x40, 0xa3, 0x7b, 0x98, 0xa0, 0x43, 0xd0, 0x33, 0xb, 0xe8, 0x97, 0x74, 0x4c, 0xaf, 0x3c, 0xdf, 0xe7, 0x4, 0xdc, 0x3f, 0x7, 0xe4, 0x77, 0x94, 0xac, 0x4f, 0x1, 0xe2, 0xda, 0x39, 0xaa, 0x49, 0x71, 0x92, 0x4a, 0xa9, 0x91, 0x72, 0xe1, 0x2, 0x3a, 0xd9}, + {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5, 0xe6, 0x2, 0x33, 0xd7, 0x51, 0xb5, 0x84, 0x60, 0x95, 0x71, 0x40, 0xa4, 0x22, 0xc6, 0xf7, 0x13, 0xd1, 0x35, 0x4, 0xe0, 0x66, 0x82, 0xb3, 0x57, 0xa2, 0x46, 0x77, 0x93, 0x15, 0xf1, 0xc0, 0x24, 0x37, 0xd3, 0xe2, 0x6, 0x80, 0x64, 0x55, 0xb1, 0x44, 0xa0, 0x91, 0x75, 0xf3, 0x17, 0x26, 0xc2, 0xbf, 0x5b, 0x6a, 0x8e, 0x8, 0xec, 0xdd, 0x39, 0xcc, 0x28, 0x19, 0xfd, 0x7b, 0x9f, 0xae, 0x4a, 0x59, 0xbd, 0x8c, 0x68, 0xee, 0xa, 0x3b, 0xdf, 0x2a, 0xce, 0xff, 0x1b, 0x9d, 0x79, 0x48, 0xac, 0x6e, 0x8a, 0xbb, 0x5f, 0xd9, 0x3d, 0xc, 0xe8, 0x1d, 0xf9, 0xc8, 0x2c, 0xaa, 0x4e, 0x7f, 0x9b, 0x88, 0x6c, 0x5d, 0xb9, 0x3f, 0xdb, 0xea, 0xe, 0xfb, 0x1f, 0x2e, 0xca, 0x4c, 0xa8, 0x99, 0x7d, 0x63, 0x87, 0xb6, 0x52, 0xd4, 0x30, 0x1, 0xe5, 0x10, 0xf4, 0xc5, 0x21, 0xa7, 0x43, 0x72, 0x96, 0x85, 0x61, 0x50, 0xb4, 0x32, 0xd6, 0xe7, 0x3, 0xf6, 0x12, 0x23, 0xc7, 0x41, 0xa5, 0x94, 0x70, 0xb2, 0x56, 0x67, 0x83, 0x5, 0xe1, 0xd0, 0x34, 0xc1, 0x25, 0x14, 0xf0, 0x76, 0x92, 0xa3, 0x47, 0x54, 0xb0, 0x81, 0x65, 0xe3, 0x7, 0x36, 0xd2, 0x27, 0xc3, 0xf2, 0x16, 0x90, 0x74, 0x45, 0xa1, 0xdc, 0x38, 0x9, 0xed, 0x6b, 0x8f, 0xbe, 0x5a, 0xaf, 0x4b, 
0x7a, 0x9e, 0x18, 0xfc, 0xcd, 0x29, 0x3a, 0xde, 0xef, 0xb, 0x8d, 0x69, 0x58, 0xbc, 0x49, 0xad, 0x9c, 0x78, 0xfe, 0x1a, 0x2b, 0xcf, 0xd, 0xe9, 0xd8, 0x3c, 0xba, 0x5e, 0x6f, 0x8b, 0x7e, 0x9a, 0xab, 0x4f, 0xc9, 0x2d, 0x1c, 0xf8, 0xeb, 0xf, 0x3e, 0xda, 0x5c, 0xb8, 0x89, 0x6d, 0x98, 0x7c, 0x4d, 0xa9, 0x2f, 0xcb, 0xfa, 0x1e}, + {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa, 0xf6, 0x13, 0x21, 0xc4, 0x45, 0xa0, 0x92, 0x77, 0x8d, 0x68, 0x5a, 0xbf, 0x3e, 0xdb, 0xe9, 0xc, 0xf1, 0x14, 0x26, 0xc3, 0x42, 0xa7, 0x95, 0x70, 0x8a, 0x6f, 0x5d, 0xb8, 0x39, 0xdc, 0xee, 0xb, 0x7, 0xe2, 0xd0, 0x35, 0xb4, 0x51, 0x63, 0x86, 0x7c, 0x99, 0xab, 0x4e, 0xcf, 0x2a, 0x18, 0xfd, 0xff, 0x1a, 0x28, 0xcd, 0x4c, 0xa9, 0x9b, 0x7e, 0x84, 0x61, 0x53, 0xb6, 0x37, 0xd2, 0xe0, 0x5, 0x9, 0xec, 0xde, 0x3b, 0xba, 0x5f, 0x6d, 0x88, 0x72, 0x97, 0xa5, 0x40, 0xc1, 0x24, 0x16, 0xf3, 0xe, 0xeb, 0xd9, 0x3c, 0xbd, 0x58, 0x6a, 0x8f, 0x75, 0x90, 0xa2, 0x47, 0xc6, 0x23, 0x11, 0xf4, 0xf8, 0x1d, 0x2f, 0xca, 0x4b, 0xae, 0x9c, 0x79, 0x83, 0x66, 0x54, 0xb1, 0x30, 0xd5, 0xe7, 0x2, 0xe3, 0x6, 0x34, 0xd1, 0x50, 0xb5, 0x87, 0x62, 0x98, 0x7d, 0x4f, 0xaa, 0x2b, 0xce, 0xfc, 0x19, 0x15, 0xf0, 0xc2, 0x27, 0xa6, 0x43, 0x71, 0x94, 0x6e, 0x8b, 0xb9, 0x5c, 0xdd, 0x38, 0xa, 0xef, 0x12, 0xf7, 0xc5, 0x20, 0xa1, 0x44, 0x76, 0x93, 0x69, 0x8c, 0xbe, 0x5b, 0xda, 0x3f, 0xd, 0xe8, 0xe4, 0x1, 0x33, 0xd6, 0x57, 0xb2, 0x80, 0x65, 0x9f, 0x7a, 0x48, 0xad, 0x2c, 0xc9, 0xfb, 0x1e, 0x1c, 0xf9, 0xcb, 0x2e, 0xaf, 0x4a, 0x78, 0x9d, 0x67, 0x82, 0xb0, 0x55, 0xd4, 0x31, 0x3, 0xe6, 0xea, 0xf, 0x3d, 0xd8, 0x59, 0xbc, 0x8e, 0x6b, 0x91, 0x74, 0x46, 0xa3, 0x22, 0xc7, 0xf5, 0x10, 0xed, 0x8, 0x3a, 0xdf, 0x5e, 0xbb, 0x89, 0x6c, 0x96, 0x73, 0x41, 0xa4, 0x25, 0xc0, 0xf2, 0x17, 0x1b, 0xfe, 0xcc, 0x29, 0xa8, 0x4d, 0x7f, 0x9a, 0x60, 0x85, 0xb7, 0x52, 0xd3, 0x36, 0x4, 0xe1}, + {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb, 0xc6, 0x20, 0x17, 0xf1, 0x79, 0x9f, 0xa8, 0x4e, 0xa5, 0x43, 0x74, 0x92, 0x1a, 0xfc, 0xcb, 0x2d, 0x91, 0x77, 0x40, 0xa6, 0x2e, 0xc8, 0xff, 0x19, 0xf2, 0x14, 0x23, 0xc5, 0x4d, 0xab, 0x9c, 0x7a, 0x57, 0xb1, 0x86, 0x60, 0xe8, 0xe, 0x39, 0xdf, 0x34, 0xd2, 0xe5, 0x3, 0x8b, 0x6d, 0x5a, 0xbc, 0x3f, 0xd9, 0xee, 0x8, 0x80, 0x66, 0x51, 0xb7, 0x5c, 0xba, 0x8d, 0x6b, 0xe3, 0x5, 0x32, 0xd4, 0xf9, 0x1f, 0x28, 0xce, 0x46, 0xa0, 0x97, 0x71, 0x9a, 0x7c, 0x4b, 0xad, 0x25, 0xc3, 0xf4, 0x12, 0xae, 0x48, 0x7f, 0x99, 0x11, 0xf7, 0xc0, 0x26, 0xcd, 0x2b, 0x1c, 0xfa, 0x72, 0x94, 0xa3, 0x45, 0x68, 0x8e, 0xb9, 0x5f, 0xd7, 0x31, 0x6, 0xe0, 0xb, 0xed, 0xda, 0x3c, 0xb4, 0x52, 0x65, 0x83, 0x7e, 0x98, 0xaf, 0x49, 0xc1, 0x27, 0x10, 0xf6, 0x1d, 0xfb, 0xcc, 0x2a, 0xa2, 0x44, 0x73, 0x95, 0xb8, 0x5e, 0x69, 0x8f, 0x7, 0xe1, 0xd6, 0x30, 0xdb, 0x3d, 0xa, 0xec, 0x64, 0x82, 0xb5, 0x53, 0xef, 0x9, 0x3e, 0xd8, 0x50, 0xb6, 0x81, 0x67, 0x8c, 0x6a, 0x5d, 0xbb, 0x33, 0xd5, 0xe2, 0x4, 0x29, 0xcf, 0xf8, 0x1e, 0x96, 0x70, 0x47, 0xa1, 0x4a, 0xac, 0x9b, 0x7d, 0xf5, 0x13, 0x24, 0xc2, 0x41, 0xa7, 0x90, 0x76, 0xfe, 0x18, 0x2f, 0xc9, 0x22, 0xc4, 0xf3, 0x15, 0x9d, 0x7b, 0x4c, 0xaa, 0x87, 0x61, 0x56, 0xb0, 0x38, 0xde, 0xe9, 0xf, 0xe4, 0x2, 0x35, 0xd3, 0x5b, 0xbd, 0x8a, 0x6c, 0xd0, 0x36, 0x1, 0xe7, 0x6f, 0x89, 0xbe, 0x58, 0xb3, 0x55, 0x62, 0x84, 0xc, 0xea, 0xdd, 0x3b, 0x16, 0xf0, 0xc7, 0x21, 0xa9, 0x4f, 0x78, 0x9e, 0x75, 0x93, 0xa4, 0x42, 0xca, 0x2c, 0x1b, 0xfd}, + {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4, 0xd6, 0x31, 0x5, 0xe2, 0x6d, 0x8a, 0xbe, 0x59, 0xbd, 0x5a, 0x6e, 0x89, 0x6, 0xe1, 0xd5, 
0x32, 0xb1, 0x56, 0x62, 0x85, 0xa, 0xed, 0xd9, 0x3e, 0xda, 0x3d, 0x9, 0xee, 0x61, 0x86, 0xb2, 0x55, 0x67, 0x80, 0xb4, 0x53, 0xdc, 0x3b, 0xf, 0xe8, 0xc, 0xeb, 0xdf, 0x38, 0xb7, 0x50, 0x64, 0x83, 0x7f, 0x98, 0xac, 0x4b, 0xc4, 0x23, 0x17, 0xf0, 0x14, 0xf3, 0xc7, 0x20, 0xaf, 0x48, 0x7c, 0x9b, 0xa9, 0x4e, 0x7a, 0x9d, 0x12, 0xf5, 0xc1, 0x26, 0xc2, 0x25, 0x11, 0xf6, 0x79, 0x9e, 0xaa, 0x4d, 0xce, 0x29, 0x1d, 0xfa, 0x75, 0x92, 0xa6, 0x41, 0xa5, 0x42, 0x76, 0x91, 0x1e, 0xf9, 0xcd, 0x2a, 0x18, 0xff, 0xcb, 0x2c, 0xa3, 0x44, 0x70, 0x97, 0x73, 0x94, 0xa0, 0x47, 0xc8, 0x2f, 0x1b, 0xfc, 0xfe, 0x19, 0x2d, 0xca, 0x45, 0xa2, 0x96, 0x71, 0x95, 0x72, 0x46, 0xa1, 0x2e, 0xc9, 0xfd, 0x1a, 0x28, 0xcf, 0xfb, 0x1c, 0x93, 0x74, 0x40, 0xa7, 0x43, 0xa4, 0x90, 0x77, 0xf8, 0x1f, 0x2b, 0xcc, 0x4f, 0xa8, 0x9c, 0x7b, 0xf4, 0x13, 0x27, 0xc0, 0x24, 0xc3, 0xf7, 0x10, 0x9f, 0x78, 0x4c, 0xab, 0x99, 0x7e, 0x4a, 0xad, 0x22, 0xc5, 0xf1, 0x16, 0xf2, 0x15, 0x21, 0xc6, 0x49, 0xae, 0x9a, 0x7d, 0x81, 0x66, 0x52, 0xb5, 0x3a, 0xdd, 0xe9, 0xe, 0xea, 0xd, 0x39, 0xde, 0x51, 0xb6, 0x82, 0x65, 0x57, 0xb0, 0x84, 0x63, 0xec, 0xb, 0x3f, 0xd8, 0x3c, 0xdb, 0xef, 0x8, 0x87, 0x60, 0x54, 0xb3, 0x30, 0xd7, 0xe3, 0x4, 0x8b, 0x6c, 0x58, 0xbf, 0x5b, 0xbc, 0x88, 0x6f, 0xe0, 0x7, 0x33, 0xd4, 0xe6, 0x1, 0x35, 0xd2, 0x5d, 0xba, 0x8e, 0x69, 0x8d, 0x6a, 0x5e, 0xb9, 0x36, 0xd1, 0xe5, 0x2}, + {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1, 0x26, 0xce, 0xeb, 0x3, 0xa1, 0x49, 0x6c, 0x84, 0x35, 0xdd, 0xf8, 0x10, 0xb2, 0x5a, 0x7f, 0x97, 0x4c, 0xa4, 0x81, 0x69, 0xcb, 0x23, 0x6, 0xee, 0x5f, 0xb7, 0x92, 0x7a, 0xd8, 0x30, 0x15, 0xfd, 0x6a, 0x82, 0xa7, 0x4f, 0xed, 0x5, 0x20, 0xc8, 0x79, 0x91, 0xb4, 0x5c, 0xfe, 0x16, 0x33, 0xdb, 0x98, 0x70, 0x55, 0xbd, 0x1f, 0xf7, 0xd2, 0x3a, 0x8b, 0x63, 0x46, 0xae, 0xc, 0xe4, 0xc1, 0x29, 0xbe, 0x56, 0x73, 0x9b, 0x39, 0xd1, 0xf4, 0x1c, 0xad, 0x45, 0x60, 0x88, 0x2a, 0xc2, 0xe7, 0xf, 0xd4, 0x3c, 0x19, 0xf1, 0x53, 0xbb, 0x9e, 0x76, 0xc7, 0x2f, 0xa, 0xe2, 0x40, 0xa8, 0x8d, 0x65, 0xf2, 0x1a, 0x3f, 0xd7, 0x75, 0x9d, 0xb8, 0x50, 0xe1, 0x9, 0x2c, 0xc4, 0x66, 0x8e, 0xab, 0x43, 0x2d, 0xc5, 0xe0, 0x8, 0xaa, 0x42, 0x67, 0x8f, 0x3e, 0xd6, 0xf3, 0x1b, 0xb9, 0x51, 0x74, 0x9c, 0xb, 0xe3, 0xc6, 0x2e, 0x8c, 0x64, 0x41, 0xa9, 0x18, 0xf0, 0xd5, 0x3d, 0x9f, 0x77, 0x52, 0xba, 0x61, 0x89, 0xac, 0x44, 0xe6, 0xe, 0x2b, 0xc3, 0x72, 0x9a, 0xbf, 0x57, 0xf5, 0x1d, 0x38, 0xd0, 0x47, 0xaf, 0x8a, 0x62, 0xc0, 0x28, 0xd, 0xe5, 0x54, 0xbc, 0x99, 0x71, 0xd3, 0x3b, 0x1e, 0xf6, 0xb5, 0x5d, 0x78, 0x90, 0x32, 0xda, 0xff, 0x17, 0xa6, 0x4e, 0x6b, 0x83, 0x21, 0xc9, 0xec, 0x4, 0x93, 0x7b, 0x5e, 0xb6, 0x14, 0xfc, 0xd9, 0x31, 0x80, 0x68, 0x4d, 0xa5, 0x7, 0xef, 0xca, 0x22, 0xf9, 0x11, 0x34, 0xdc, 0x7e, 0x96, 0xb3, 0x5b, 0xea, 0x2, 0x27, 0xcf, 0x6d, 0x85, 0xa0, 0x48, 0xdf, 0x37, 0x12, 0xfa, 0x58, 0xb0, 0x95, 0x7d, 0xcc, 0x24, 0x1, 0xe9, 0x4b, 0xa3, 0x86, 0x6e}, + {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe, 0x36, 0xdf, 0xf9, 0x10, 0xb5, 0x5c, 0x7a, 0x93, 0x2d, 0xc4, 0xe2, 0xb, 0xae, 0x47, 0x61, 0x88, 0x6c, 0x85, 0xa3, 0x4a, 0xef, 0x6, 0x20, 0xc9, 0x77, 0x9e, 0xb8, 0x51, 0xf4, 0x1d, 0x3b, 0xd2, 0x5a, 0xb3, 0x95, 0x7c, 0xd9, 0x30, 0x16, 0xff, 0x41, 0xa8, 0x8e, 0x67, 0xc2, 0x2b, 0xd, 0xe4, 0xd8, 0x31, 0x17, 0xfe, 0x5b, 0xb2, 0x94, 0x7d, 0xc3, 0x2a, 0xc, 0xe5, 0x40, 0xa9, 0x8f, 0x66, 0xee, 0x7, 0x21, 0xc8, 0x6d, 0x84, 0xa2, 0x4b, 0xf5, 0x1c, 0x3a, 0xd3, 0x76, 0x9f, 0xb9, 0x50, 0xb4, 0x5d, 0x7b, 0x92, 0x37, 0xde, 0xf8, 0x11, 0xaf, 0x46, 0x60, 0x89, 0x2c, 0xc5, 0xe3, 0xa, 0x82, 0x6b, 0x4d, 0xa4, 
0x1, 0xe8, 0xce, 0x27, 0x99, 0x70, 0x56, 0xbf, 0x1a, 0xf3, 0xd5, 0x3c, 0xad, 0x44, 0x62, 0x8b, 0x2e, 0xc7, 0xe1, 0x8, 0xb6, 0x5f, 0x79, 0x90, 0x35, 0xdc, 0xfa, 0x13, 0x9b, 0x72, 0x54, 0xbd, 0x18, 0xf1, 0xd7, 0x3e, 0x80, 0x69, 0x4f, 0xa6, 0x3, 0xea, 0xcc, 0x25, 0xc1, 0x28, 0xe, 0xe7, 0x42, 0xab, 0x8d, 0x64, 0xda, 0x33, 0x15, 0xfc, 0x59, 0xb0, 0x96, 0x7f, 0xf7, 0x1e, 0x38, 0xd1, 0x74, 0x9d, 0xbb, 0x52, 0xec, 0x5, 0x23, 0xca, 0x6f, 0x86, 0xa0, 0x49, 0x75, 0x9c, 0xba, 0x53, 0xf6, 0x1f, 0x39, 0xd0, 0x6e, 0x87, 0xa1, 0x48, 0xed, 0x4, 0x22, 0xcb, 0x43, 0xaa, 0x8c, 0x65, 0xc0, 0x29, 0xf, 0xe6, 0x58, 0xb1, 0x97, 0x7e, 0xdb, 0x32, 0x14, 0xfd, 0x19, 0xf0, 0xd6, 0x3f, 0x9a, 0x73, 0x55, 0xbc, 0x2, 0xeb, 0xcd, 0x24, 0x81, 0x68, 0x4e, 0xa7, 0x2f, 0xc6, 0xe0, 0x9, 0xac, 0x45, 0x63, 0x8a, 0x34, 0xdd, 0xfb, 0x12, 0xb7, 0x5e, 0x78, 0x91}, + {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf, 0x6, 0xec, 0xcf, 0x25, 0x89, 0x63, 0x40, 0xaa, 0x5, 0xef, 0xcc, 0x26, 0x8a, 0x60, 0x43, 0xa9, 0xc, 0xe6, 0xc5, 0x2f, 0x83, 0x69, 0x4a, 0xa0, 0xf, 0xe5, 0xc6, 0x2c, 0x80, 0x6a, 0x49, 0xa3, 0xa, 0xe0, 0xc3, 0x29, 0x85, 0x6f, 0x4c, 0xa6, 0x9, 0xe3, 0xc0, 0x2a, 0x86, 0x6c, 0x4f, 0xa5, 0x18, 0xf2, 0xd1, 0x3b, 0x97, 0x7d, 0x5e, 0xb4, 0x1b, 0xf1, 0xd2, 0x38, 0x94, 0x7e, 0x5d, 0xb7, 0x1e, 0xf4, 0xd7, 0x3d, 0x91, 0x7b, 0x58, 0xb2, 0x1d, 0xf7, 0xd4, 0x3e, 0x92, 0x78, 0x5b, 0xb1, 0x14, 0xfe, 0xdd, 0x37, 0x9b, 0x71, 0x52, 0xb8, 0x17, 0xfd, 0xde, 0x34, 0x98, 0x72, 0x51, 0xbb, 0x12, 0xf8, 0xdb, 0x31, 0x9d, 0x77, 0x54, 0xbe, 0x11, 0xfb, 0xd8, 0x32, 0x9e, 0x74, 0x57, 0xbd, 0x30, 0xda, 0xf9, 0x13, 0xbf, 0x55, 0x76, 0x9c, 0x33, 0xd9, 0xfa, 0x10, 0xbc, 0x56, 0x75, 0x9f, 0x36, 0xdc, 0xff, 0x15, 0xb9, 0x53, 0x70, 0x9a, 0x35, 0xdf, 0xfc, 0x16, 0xba, 0x50, 0x73, 0x99, 0x3c, 0xd6, 0xf5, 0x1f, 0xb3, 0x59, 0x7a, 0x90, 0x3f, 0xd5, 0xf6, 0x1c, 0xb0, 0x5a, 0x79, 0x93, 0x3a, 0xd0, 0xf3, 0x19, 0xb5, 0x5f, 0x7c, 0x96, 0x39, 0xd3, 0xf0, 0x1a, 0xb6, 0x5c, 0x7f, 0x95, 0x28, 0xc2, 0xe1, 0xb, 0xa7, 0x4d, 0x6e, 0x84, 0x2b, 0xc1, 0xe2, 0x8, 0xa4, 0x4e, 0x6d, 0x87, 0x2e, 0xc4, 0xe7, 0xd, 0xa1, 0x4b, 0x68, 0x82, 0x2d, 0xc7, 0xe4, 0xe, 0xa2, 0x48, 0x6b, 0x81, 0x24, 0xce, 0xed, 0x7, 0xab, 0x41, 0x62, 0x88, 0x27, 0xcd, 0xee, 0x4, 0xa8, 0x42, 0x61, 0x8b, 0x22, 0xc8, 0xeb, 0x1, 0xad, 0x47, 0x64, 0x8e, 0x21, 0xcb, 0xe8, 0x2, 0xae, 0x44, 0x67, 0x8d}, + {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0, 0x16, 0xfd, 0xdd, 0x36, 0x9d, 0x76, 0x56, 0xbd, 0x1d, 0xf6, 0xd6, 0x3d, 0x96, 0x7d, 0x5d, 0xb6, 0x2c, 0xc7, 0xe7, 0xc, 0xa7, 0x4c, 0x6c, 0x87, 0x27, 0xcc, 0xec, 0x7, 0xac, 0x47, 0x67, 0x8c, 0x3a, 0xd1, 0xf1, 0x1a, 0xb1, 0x5a, 0x7a, 0x91, 0x31, 0xda, 0xfa, 0x11, 0xba, 0x51, 0x71, 0x9a, 0x58, 0xb3, 0x93, 0x78, 0xd3, 0x38, 0x18, 0xf3, 0x53, 0xb8, 0x98, 0x73, 0xd8, 0x33, 0x13, 0xf8, 0x4e, 0xa5, 0x85, 0x6e, 0xc5, 0x2e, 0xe, 0xe5, 0x45, 0xae, 0x8e, 0x65, 0xce, 0x25, 0x5, 0xee, 0x74, 0x9f, 0xbf, 0x54, 0xff, 0x14, 0x34, 0xdf, 0x7f, 0x94, 0xb4, 0x5f, 0xf4, 0x1f, 0x3f, 0xd4, 0x62, 0x89, 0xa9, 0x42, 0xe9, 0x2, 0x22, 0xc9, 0x69, 0x82, 0xa2, 0x49, 0xe2, 0x9, 0x29, 0xc2, 0xb0, 0x5b, 0x7b, 0x90, 0x3b, 0xd0, 0xf0, 0x1b, 0xbb, 0x50, 0x70, 0x9b, 0x30, 0xdb, 0xfb, 0x10, 0xa6, 0x4d, 0x6d, 0x86, 0x2d, 0xc6, 0xe6, 0xd, 0xad, 0x46, 0x66, 0x8d, 0x26, 0xcd, 0xed, 0x6, 0x9c, 0x77, 0x57, 0xbc, 0x17, 0xfc, 0xdc, 0x37, 0x97, 0x7c, 0x5c, 0xb7, 0x1c, 0xf7, 0xd7, 0x3c, 0x8a, 0x61, 0x41, 0xaa, 0x1, 0xea, 0xca, 0x21, 0x81, 0x6a, 0x4a, 0xa1, 0xa, 0xe1, 0xc1, 0x2a, 0xe8, 0x3, 0x23, 0xc8, 0x63, 0x88, 0xa8, 0x43, 0xe3, 
0x8, 0x28, 0xc3, 0x68, 0x83, 0xa3, 0x48, 0xfe, 0x15, 0x35, 0xde, 0x75, 0x9e, 0xbe, 0x55, 0xf5, 0x1e, 0x3e, 0xd5, 0x7e, 0x95, 0xb5, 0x5e, 0xc4, 0x2f, 0xf, 0xe4, 0x4f, 0xa4, 0x84, 0x6f, 0xcf, 0x24, 0x4, 0xef, 0x44, 0xaf, 0x8f, 0x64, 0xd2, 0x39, 0x19, 0xf2, 0x59, 0xb2, 0x92, 0x79, 0xd9, 0x32, 0x12, 0xf9, 0x52, 0xb9, 0x99, 0x72}, + {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d, 0x66, 0x8a, 0xa3, 0x4f, 0xf1, 0x1d, 0x34, 0xd8, 0x55, 0xb9, 0x90, 0x7c, 0xc2, 0x2e, 0x7, 0xeb, 0xcc, 0x20, 0x9, 0xe5, 0x5b, 0xb7, 0x9e, 0x72, 0xff, 0x13, 0x3a, 0xd6, 0x68, 0x84, 0xad, 0x41, 0xaa, 0x46, 0x6f, 0x83, 0x3d, 0xd1, 0xf8, 0x14, 0x99, 0x75, 0x5c, 0xb0, 0xe, 0xe2, 0xcb, 0x27, 0x85, 0x69, 0x40, 0xac, 0x12, 0xfe, 0xd7, 0x3b, 0xb6, 0x5a, 0x73, 0x9f, 0x21, 0xcd, 0xe4, 0x8, 0xe3, 0xf, 0x26, 0xca, 0x74, 0x98, 0xb1, 0x5d, 0xd0, 0x3c, 0x15, 0xf9, 0x47, 0xab, 0x82, 0x6e, 0x49, 0xa5, 0x8c, 0x60, 0xde, 0x32, 0x1b, 0xf7, 0x7a, 0x96, 0xbf, 0x53, 0xed, 0x1, 0x28, 0xc4, 0x2f, 0xc3, 0xea, 0x6, 0xb8, 0x54, 0x7d, 0x91, 0x1c, 0xf0, 0xd9, 0x35, 0x8b, 0x67, 0x4e, 0xa2, 0x17, 0xfb, 0xd2, 0x3e, 0x80, 0x6c, 0x45, 0xa9, 0x24, 0xc8, 0xe1, 0xd, 0xb3, 0x5f, 0x76, 0x9a, 0x71, 0x9d, 0xb4, 0x58, 0xe6, 0xa, 0x23, 0xcf, 0x42, 0xae, 0x87, 0x6b, 0xd5, 0x39, 0x10, 0xfc, 0xdb, 0x37, 0x1e, 0xf2, 0x4c, 0xa0, 0x89, 0x65, 0xe8, 0x4, 0x2d, 0xc1, 0x7f, 0x93, 0xba, 0x56, 0xbd, 0x51, 0x78, 0x94, 0x2a, 0xc6, 0xef, 0x3, 0x8e, 0x62, 0x4b, 0xa7, 0x19, 0xf5, 0xdc, 0x30, 0x92, 0x7e, 0x57, 0xbb, 0x5, 0xe9, 0xc0, 0x2c, 0xa1, 0x4d, 0x64, 0x88, 0x36, 0xda, 0xf3, 0x1f, 0xf4, 0x18, 0x31, 0xdd, 0x63, 0x8f, 0xa6, 0x4a, 0xc7, 0x2b, 0x2, 0xee, 0x50, 0xbc, 0x95, 0x79, 0x5e, 0xb2, 0x9b, 0x77, 0xc9, 0x25, 0xc, 0xe0, 0x6d, 0x81, 0xa8, 0x44, 0xfa, 0x16, 0x3f, 0xd3, 0x38, 0xd4, 0xfd, 0x11, 0xaf, 0x43, 0x6a, 0x86, 0xb, 0xe7, 0xce, 0x22, 0x9c, 0x70, 0x59, 0xb5}, + {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82, 0x76, 0x9b, 0xb1, 0x5c, 0xe5, 0x8, 0x22, 0xcf, 0x4d, 0xa0, 0x8a, 0x67, 0xde, 0x33, 0x19, 0xf4, 0xec, 0x1, 0x2b, 0xc6, 0x7f, 0x92, 0xb8, 0x55, 0xd7, 0x3a, 0x10, 0xfd, 0x44, 0xa9, 0x83, 0x6e, 0x9a, 0x77, 0x5d, 0xb0, 0x9, 0xe4, 0xce, 0x23, 0xa1, 0x4c, 0x66, 0x8b, 0x32, 0xdf, 0xf5, 0x18, 0xc5, 0x28, 0x2, 0xef, 0x56, 0xbb, 0x91, 0x7c, 0xfe, 0x13, 0x39, 0xd4, 0x6d, 0x80, 0xaa, 0x47, 0xb3, 0x5e, 0x74, 0x99, 0x20, 0xcd, 0xe7, 0xa, 0x88, 0x65, 0x4f, 0xa2, 0x1b, 0xf6, 0xdc, 0x31, 0x29, 0xc4, 0xee, 0x3, 0xba, 0x57, 0x7d, 0x90, 0x12, 0xff, 0xd5, 0x38, 0x81, 0x6c, 0x46, 0xab, 0x5f, 0xb2, 0x98, 0x75, 0xcc, 0x21, 0xb, 0xe6, 0x64, 0x89, 0xa3, 0x4e, 0xf7, 0x1a, 0x30, 0xdd, 0x97, 0x7a, 0x50, 0xbd, 0x4, 0xe9, 0xc3, 0x2e, 0xac, 0x41, 0x6b, 0x86, 0x3f, 0xd2, 0xf8, 0x15, 0xe1, 0xc, 0x26, 0xcb, 0x72, 0x9f, 0xb5, 0x58, 0xda, 0x37, 0x1d, 0xf0, 0x49, 0xa4, 0x8e, 0x63, 0x7b, 0x96, 0xbc, 0x51, 0xe8, 0x5, 0x2f, 0xc2, 0x40, 0xad, 0x87, 0x6a, 0xd3, 0x3e, 0x14, 0xf9, 0xd, 0xe0, 0xca, 0x27, 0x9e, 0x73, 0x59, 0xb4, 0x36, 0xdb, 0xf1, 0x1c, 0xa5, 0x48, 0x62, 0x8f, 0x52, 0xbf, 0x95, 0x78, 0xc1, 0x2c, 0x6, 0xeb, 0x69, 0x84, 0xae, 0x43, 0xfa, 0x17, 0x3d, 0xd0, 0x24, 0xc9, 0xe3, 0xe, 0xb7, 0x5a, 0x70, 0x9d, 0x1f, 0xf2, 0xd8, 0x35, 0x8c, 0x61, 0x4b, 0xa6, 0xbe, 0x53, 0x79, 0x94, 0x2d, 0xc0, 0xea, 0x7, 0x85, 0x68, 0x42, 0xaf, 0x16, 0xfb, 0xd1, 0x3c, 0xc8, 0x25, 0xf, 0xe2, 0x5b, 0xb6, 0x9c, 0x71, 0xf3, 0x1e, 0x34, 0xd9, 0x60, 0x8d, 0xa7, 0x4a}, + {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93, 0x46, 0xa8, 0x87, 0x69, 0xd9, 0x37, 0x18, 0xf6, 0x65, 0x8b, 0xa4, 0x4a, 0xfa, 
0x14, 0x3b, 0xd5, 0x8c, 0x62, 0x4d, 0xa3, 0x13, 0xfd, 0xd2, 0x3c, 0xaf, 0x41, 0x6e, 0x80, 0x30, 0xde, 0xf1, 0x1f, 0xca, 0x24, 0xb, 0xe5, 0x55, 0xbb, 0x94, 0x7a, 0xe9, 0x7, 0x28, 0xc6, 0x76, 0x98, 0xb7, 0x59, 0x5, 0xeb, 0xc4, 0x2a, 0x9a, 0x74, 0x5b, 0xb5, 0x26, 0xc8, 0xe7, 0x9, 0xb9, 0x57, 0x78, 0x96, 0x43, 0xad, 0x82, 0x6c, 0xdc, 0x32, 0x1d, 0xf3, 0x60, 0x8e, 0xa1, 0x4f, 0xff, 0x11, 0x3e, 0xd0, 0x89, 0x67, 0x48, 0xa6, 0x16, 0xf8, 0xd7, 0x39, 0xaa, 0x44, 0x6b, 0x85, 0x35, 0xdb, 0xf4, 0x1a, 0xcf, 0x21, 0xe, 0xe0, 0x50, 0xbe, 0x91, 0x7f, 0xec, 0x2, 0x2d, 0xc3, 0x73, 0x9d, 0xb2, 0x5c, 0xa, 0xe4, 0xcb, 0x25, 0x95, 0x7b, 0x54, 0xba, 0x29, 0xc7, 0xe8, 0x6, 0xb6, 0x58, 0x77, 0x99, 0x4c, 0xa2, 0x8d, 0x63, 0xd3, 0x3d, 0x12, 0xfc, 0x6f, 0x81, 0xae, 0x40, 0xf0, 0x1e, 0x31, 0xdf, 0x86, 0x68, 0x47, 0xa9, 0x19, 0xf7, 0xd8, 0x36, 0xa5, 0x4b, 0x64, 0x8a, 0x3a, 0xd4, 0xfb, 0x15, 0xc0, 0x2e, 0x1, 0xef, 0x5f, 0xb1, 0x9e, 0x70, 0xe3, 0xd, 0x22, 0xcc, 0x7c, 0x92, 0xbd, 0x53, 0xf, 0xe1, 0xce, 0x20, 0x90, 0x7e, 0x51, 0xbf, 0x2c, 0xc2, 0xed, 0x3, 0xb3, 0x5d, 0x72, 0x9c, 0x49, 0xa7, 0x88, 0x66, 0xd6, 0x38, 0x17, 0xf9, 0x6a, 0x84, 0xab, 0x45, 0xf5, 0x1b, 0x34, 0xda, 0x83, 0x6d, 0x42, 0xac, 0x1c, 0xf2, 0xdd, 0x33, 0xa0, 0x4e, 0x61, 0x8f, 0x3f, 0xd1, 0xfe, 0x10, 0xc5, 0x2b, 0x4, 0xea, 0x5a, 0xb4, 0x9b, 0x75, 0xe6, 0x8, 0x27, 0xc9, 0x79, 0x97, 0xb8, 0x56}, + {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c, 0x56, 0xb9, 0x95, 0x7a, 0xcd, 0x22, 0xe, 0xe1, 0x7d, 0x92, 0xbe, 0x51, 0xe6, 0x9, 0x25, 0xca, 0xac, 0x43, 0x6f, 0x80, 0x37, 0xd8, 0xf4, 0x1b, 0x87, 0x68, 0x44, 0xab, 0x1c, 0xf3, 0xdf, 0x30, 0xfa, 0x15, 0x39, 0xd6, 0x61, 0x8e, 0xa2, 0x4d, 0xd1, 0x3e, 0x12, 0xfd, 0x4a, 0xa5, 0x89, 0x66, 0x45, 0xaa, 0x86, 0x69, 0xde, 0x31, 0x1d, 0xf2, 0x6e, 0x81, 0xad, 0x42, 0xf5, 0x1a, 0x36, 0xd9, 0x13, 0xfc, 0xd0, 0x3f, 0x88, 0x67, 0x4b, 0xa4, 0x38, 0xd7, 0xfb, 0x14, 0xa3, 0x4c, 0x60, 0x8f, 0xe9, 0x6, 0x2a, 0xc5, 0x72, 0x9d, 0xb1, 0x5e, 0xc2, 0x2d, 0x1, 0xee, 0x59, 0xb6, 0x9a, 0x75, 0xbf, 0x50, 0x7c, 0x93, 0x24, 0xcb, 0xe7, 0x8, 0x94, 0x7b, 0x57, 0xb8, 0xf, 0xe0, 0xcc, 0x23, 0x8a, 0x65, 0x49, 0xa6, 0x11, 0xfe, 0xd2, 0x3d, 0xa1, 0x4e, 0x62, 0x8d, 0x3a, 0xd5, 0xf9, 0x16, 0xdc, 0x33, 0x1f, 0xf0, 0x47, 0xa8, 0x84, 0x6b, 0xf7, 0x18, 0x34, 0xdb, 0x6c, 0x83, 0xaf, 0x40, 0x26, 0xc9, 0xe5, 0xa, 0xbd, 0x52, 0x7e, 0x91, 0xd, 0xe2, 0xce, 0x21, 0x96, 0x79, 0x55, 0xba, 0x70, 0x9f, 0xb3, 0x5c, 0xeb, 0x4, 0x28, 0xc7, 0x5b, 0xb4, 0x98, 0x77, 0xc0, 0x2f, 0x3, 0xec, 0xcf, 0x20, 0xc, 0xe3, 0x54, 0xbb, 0x97, 0x78, 0xe4, 0xb, 0x27, 0xc8, 0x7f, 0x90, 0xbc, 0x53, 0x99, 0x76, 0x5a, 0xb5, 0x2, 0xed, 0xc1, 0x2e, 0xb2, 0x5d, 0x71, 0x9e, 0x29, 0xc6, 0xea, 0x5, 0x63, 0x8c, 0xa0, 0x4f, 0xf8, 0x17, 0x3b, 0xd4, 0x48, 0xa7, 0x8b, 0x64, 0xd3, 0x3c, 0x10, 0xff, 0x35, 0xda, 0xf6, 0x19, 0xae, 0x41, 0x6d, 0x82, 0x1e, 0xf1, 0xdd, 0x32, 0x85, 0x6a, 0x46, 0xa9}, + {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39, 0xbb, 0x4b, 0x46, 0xb6, 0x5c, 0xac, 0xa1, 0x51, 0x68, 0x98, 0x95, 0x65, 0x8f, 0x7f, 0x72, 0x82, 0x6b, 0x9b, 0x96, 0x66, 0x8c, 0x7c, 0x71, 0x81, 0xb8, 0x48, 0x45, 0xb5, 0x5f, 0xaf, 0xa2, 0x52, 0xd0, 0x20, 0x2d, 0xdd, 0x37, 0xc7, 0xca, 0x3a, 0x3, 0xf3, 0xfe, 0xe, 0xe4, 0x14, 0x19, 0xe9, 0xd6, 0x26, 0x2b, 0xdb, 0x31, 0xc1, 0xcc, 0x3c, 0x5, 0xf5, 0xf8, 0x8, 0xe2, 0x12, 0x1f, 0xef, 0x6d, 0x9d, 0x90, 0x60, 0x8a, 0x7a, 0x77, 0x87, 0xbe, 0x4e, 0x43, 0xb3, 0x59, 0xa9, 0xa4, 0x54, 0xbd, 0x4d, 0x40, 0xb0, 0x5a, 0xaa, 0xa7, 0x57, 0x6e, 0x9e, 0x93, 0x63, 0x89, 0x79, 0x74, 0x84, 0x6, 0xf6, 
0xfb, 0xb, 0xe1, 0x11, 0x1c, 0xec, 0xd5, 0x25, 0x28, 0xd8, 0x32, 0xc2, 0xcf, 0x3f, 0xb1, 0x41, 0x4c, 0xbc, 0x56, 0xa6, 0xab, 0x5b, 0x62, 0x92, 0x9f, 0x6f, 0x85, 0x75, 0x78, 0x88, 0xa, 0xfa, 0xf7, 0x7, 0xed, 0x1d, 0x10, 0xe0, 0xd9, 0x29, 0x24, 0xd4, 0x3e, 0xce, 0xc3, 0x33, 0xda, 0x2a, 0x27, 0xd7, 0x3d, 0xcd, 0xc0, 0x30, 0x9, 0xf9, 0xf4, 0x4, 0xee, 0x1e, 0x13, 0xe3, 0x61, 0x91, 0x9c, 0x6c, 0x86, 0x76, 0x7b, 0x8b, 0xb2, 0x42, 0x4f, 0xbf, 0x55, 0xa5, 0xa8, 0x58, 0x67, 0x97, 0x9a, 0x6a, 0x80, 0x70, 0x7d, 0x8d, 0xb4, 0x44, 0x49, 0xb9, 0x53, 0xa3, 0xae, 0x5e, 0xdc, 0x2c, 0x21, 0xd1, 0x3b, 0xcb, 0xc6, 0x36, 0xf, 0xff, 0xf2, 0x2, 0xe8, 0x18, 0x15, 0xe5, 0xc, 0xfc, 0xf1, 0x1, 0xeb, 0x1b, 0x16, 0xe6, 0xdf, 0x2f, 0x22, 0xd2, 0x38, 0xc8, 0xc5, 0x35, 0xb7, 0x47, 0x4a, 0xba, 0x50, 0xa0, 0xad, 0x5d, 0x64, 0x94, 0x99, 0x69, 0x83, 0x73, 0x7e, 0x8e}, + {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36, 0xab, 0x5a, 0x54, 0xa5, 0x48, 0xb9, 0xb7, 0x46, 0x70, 0x81, 0x8f, 0x7e, 0x93, 0x62, 0x6c, 0x9d, 0x4b, 0xba, 0xb4, 0x45, 0xa8, 0x59, 0x57, 0xa6, 0x90, 0x61, 0x6f, 0x9e, 0x73, 0x82, 0x8c, 0x7d, 0xe0, 0x11, 0x1f, 0xee, 0x3, 0xf2, 0xfc, 0xd, 0x3b, 0xca, 0xc4, 0x35, 0xd8, 0x29, 0x27, 0xd6, 0x96, 0x67, 0x69, 0x98, 0x75, 0x84, 0x8a, 0x7b, 0x4d, 0xbc, 0xb2, 0x43, 0xae, 0x5f, 0x51, 0xa0, 0x3d, 0xcc, 0xc2, 0x33, 0xde, 0x2f, 0x21, 0xd0, 0xe6, 0x17, 0x19, 0xe8, 0x5, 0xf4, 0xfa, 0xb, 0xdd, 0x2c, 0x22, 0xd3, 0x3e, 0xcf, 0xc1, 0x30, 0x6, 0xf7, 0xf9, 0x8, 0xe5, 0x14, 0x1a, 0xeb, 0x76, 0x87, 0x89, 0x78, 0x95, 0x64, 0x6a, 0x9b, 0xad, 0x5c, 0x52, 0xa3, 0x4e, 0xbf, 0xb1, 0x40, 0x31, 0xc0, 0xce, 0x3f, 0xd2, 0x23, 0x2d, 0xdc, 0xea, 0x1b, 0x15, 0xe4, 0x9, 0xf8, 0xf6, 0x7, 0x9a, 0x6b, 0x65, 0x94, 0x79, 0x88, 0x86, 0x77, 0x41, 0xb0, 0xbe, 0x4f, 0xa2, 0x53, 0x5d, 0xac, 0x7a, 0x8b, 0x85, 0x74, 0x99, 0x68, 0x66, 0x97, 0xa1, 0x50, 0x5e, 0xaf, 0x42, 0xb3, 0xbd, 0x4c, 0xd1, 0x20, 0x2e, 0xdf, 0x32, 0xc3, 0xcd, 0x3c, 0xa, 0xfb, 0xf5, 0x4, 0xe9, 0x18, 0x16, 0xe7, 0xa7, 0x56, 0x58, 0xa9, 0x44, 0xb5, 0xbb, 0x4a, 0x7c, 0x8d, 0x83, 0x72, 0x9f, 0x6e, 0x60, 0x91, 0xc, 0xfd, 0xf3, 0x2, 0xef, 0x1e, 0x10, 0xe1, 0xd7, 0x26, 0x28, 0xd9, 0x34, 0xc5, 0xcb, 0x3a, 0xec, 0x1d, 0x13, 0xe2, 0xf, 0xfe, 0xf0, 0x1, 0x37, 0xc6, 0xc8, 0x39, 0xd4, 0x25, 0x2b, 0xda, 0x47, 0xb6, 0xb8, 0x49, 0xa4, 0x55, 0x5b, 0xaa, 0x9c, 0x6d, 0x63, 0x92, 0x7f, 0x8e, 0x80, 0x71}, + {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27, 0x9b, 0x69, 0x62, 0x90, 0x74, 0x86, 0x8d, 0x7f, 0x58, 0xaa, 0xa1, 0x53, 0xb7, 0x45, 0x4e, 0xbc, 0x2b, 0xd9, 0xd2, 0x20, 0xc4, 0x36, 0x3d, 0xcf, 0xe8, 0x1a, 0x11, 0xe3, 0x7, 0xf5, 0xfe, 0xc, 0xb0, 0x42, 0x49, 0xbb, 0x5f, 0xad, 0xa6, 0x54, 0x73, 0x81, 0x8a, 0x78, 0x9c, 0x6e, 0x65, 0x97, 0x56, 0xa4, 0xaf, 0x5d, 0xb9, 0x4b, 0x40, 0xb2, 0x95, 0x67, 0x6c, 0x9e, 0x7a, 0x88, 0x83, 0x71, 0xcd, 0x3f, 0x34, 0xc6, 0x22, 0xd0, 0xdb, 0x29, 0xe, 0xfc, 0xf7, 0x5, 0xe1, 0x13, 0x18, 0xea, 0x7d, 0x8f, 0x84, 0x76, 0x92, 0x60, 0x6b, 0x99, 0xbe, 0x4c, 0x47, 0xb5, 0x51, 0xa3, 0xa8, 0x5a, 0xe6, 0x14, 0x1f, 0xed, 0x9, 0xfb, 0xf0, 0x2, 0x25, 0xd7, 0xdc, 0x2e, 0xca, 0x38, 0x33, 0xc1, 0xac, 0x5e, 0x55, 0xa7, 0x43, 0xb1, 0xba, 0x48, 0x6f, 0x9d, 0x96, 0x64, 0x80, 0x72, 0x79, 0x8b, 0x37, 0xc5, 0xce, 0x3c, 0xd8, 0x2a, 0x21, 0xd3, 0xf4, 0x6, 0xd, 0xff, 0x1b, 0xe9, 0xe2, 0x10, 0x87, 0x75, 0x7e, 0x8c, 0x68, 0x9a, 0x91, 0x63, 0x44, 0xb6, 0xbd, 0x4f, 0xab, 0x59, 0x52, 0xa0, 0x1c, 0xee, 0xe5, 0x17, 0xf3, 0x1, 0xa, 0xf8, 0xdf, 0x2d, 0x26, 0xd4, 0x30, 0xc2, 0xc9, 0x3b, 0xfa, 0x8, 0x3, 0xf1, 0x15, 0xe7, 0xec, 
0x1e, 0x39, 0xcb, 0xc0, 0x32, 0xd6, 0x24, 0x2f, 0xdd, 0x61, 0x93, 0x98, 0x6a, 0x8e, 0x7c, 0x77, 0x85, 0xa2, 0x50, 0x5b, 0xa9, 0x4d, 0xbf, 0xb4, 0x46, 0xd1, 0x23, 0x28, 0xda, 0x3e, 0xcc, 0xc7, 0x35, 0x12, 0xe0, 0xeb, 0x19, 0xfd, 0xf, 0x4, 0xf6, 0x4a, 0xb8, 0xb3, 0x41, 0xa5, 0x57, 0x5c, 0xae, 0x89, 0x7b, 0x70, 0x82, 0x66, 0x94, 0x9f, 0x6d}, + {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28, 0x8b, 0x78, 0x70, 0x83, 0x60, 0x93, 0x9b, 0x68, 0x40, 0xb3, 0xbb, 0x48, 0xab, 0x58, 0x50, 0xa3, 0xb, 0xf8, 0xf0, 0x3, 0xe0, 0x13, 0x1b, 0xe8, 0xc0, 0x33, 0x3b, 0xc8, 0x2b, 0xd8, 0xd0, 0x23, 0x80, 0x73, 0x7b, 0x88, 0x6b, 0x98, 0x90, 0x63, 0x4b, 0xb8, 0xb0, 0x43, 0xa0, 0x53, 0x5b, 0xa8, 0x16, 0xe5, 0xed, 0x1e, 0xfd, 0xe, 0x6, 0xf5, 0xdd, 0x2e, 0x26, 0xd5, 0x36, 0xc5, 0xcd, 0x3e, 0x9d, 0x6e, 0x66, 0x95, 0x76, 0x85, 0x8d, 0x7e, 0x56, 0xa5, 0xad, 0x5e, 0xbd, 0x4e, 0x46, 0xb5, 0x1d, 0xee, 0xe6, 0x15, 0xf6, 0x5, 0xd, 0xfe, 0xd6, 0x25, 0x2d, 0xde, 0x3d, 0xce, 0xc6, 0x35, 0x96, 0x65, 0x6d, 0x9e, 0x7d, 0x8e, 0x86, 0x75, 0x5d, 0xae, 0xa6, 0x55, 0xb6, 0x45, 0x4d, 0xbe, 0x2c, 0xdf, 0xd7, 0x24, 0xc7, 0x34, 0x3c, 0xcf, 0xe7, 0x14, 0x1c, 0xef, 0xc, 0xff, 0xf7, 0x4, 0xa7, 0x54, 0x5c, 0xaf, 0x4c, 0xbf, 0xb7, 0x44, 0x6c, 0x9f, 0x97, 0x64, 0x87, 0x74, 0x7c, 0x8f, 0x27, 0xd4, 0xdc, 0x2f, 0xcc, 0x3f, 0x37, 0xc4, 0xec, 0x1f, 0x17, 0xe4, 0x7, 0xf4, 0xfc, 0xf, 0xac, 0x5f, 0x57, 0xa4, 0x47, 0xb4, 0xbc, 0x4f, 0x67, 0x94, 0x9c, 0x6f, 0x8c, 0x7f, 0x77, 0x84, 0x3a, 0xc9, 0xc1, 0x32, 0xd1, 0x22, 0x2a, 0xd9, 0xf1, 0x2, 0xa, 0xf9, 0x1a, 0xe9, 0xe1, 0x12, 0xb1, 0x42, 0x4a, 0xb9, 0x5a, 0xa9, 0xa1, 0x52, 0x7a, 0x89, 0x81, 0x72, 0x91, 0x62, 0x6a, 0x99, 0x31, 0xc2, 0xca, 0x39, 0xda, 0x29, 0x21, 0xd2, 0xfa, 0x9, 0x1, 0xf2, 0x11, 0xe2, 0xea, 0x19, 0xba, 0x49, 0x41, 0xb2, 0x51, 0xa2, 0xaa, 0x59, 0x71, 0x82, 0x8a, 0x79, 0x9a, 0x69, 0x61, 0x92}, + {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5, 0xfb, 0xf, 0xe, 0xfa, 0xc, 0xf8, 0xf9, 0xd, 0x8, 0xfc, 0xfd, 0x9, 0xff, 0xb, 0xa, 0xfe, 0xeb, 0x1f, 0x1e, 0xea, 0x1c, 0xe8, 0xe9, 0x1d, 0x18, 0xec, 0xed, 0x19, 0xef, 0x1b, 0x1a, 0xee, 0x10, 0xe4, 0xe5, 0x11, 0xe7, 0x13, 0x12, 0xe6, 0xe3, 0x17, 0x16, 0xe2, 0x14, 0xe0, 0xe1, 0x15, 0xcb, 0x3f, 0x3e, 0xca, 0x3c, 0xc8, 0xc9, 0x3d, 0x38, 0xcc, 0xcd, 0x39, 0xcf, 0x3b, 0x3a, 0xce, 0x30, 0xc4, 0xc5, 0x31, 0xc7, 0x33, 0x32, 0xc6, 0xc3, 0x37, 0x36, 0xc2, 0x34, 0xc0, 0xc1, 0x35, 0x20, 0xd4, 0xd5, 0x21, 0xd7, 0x23, 0x22, 0xd6, 0xd3, 0x27, 0x26, 0xd2, 0x24, 0xd0, 0xd1, 0x25, 0xdb, 0x2f, 0x2e, 0xda, 0x2c, 0xd8, 0xd9, 0x2d, 0x28, 0xdc, 0xdd, 0x29, 0xdf, 0x2b, 0x2a, 0xde, 0x8b, 0x7f, 0x7e, 0x8a, 0x7c, 0x88, 0x89, 0x7d, 0x78, 0x8c, 0x8d, 0x79, 0x8f, 0x7b, 0x7a, 0x8e, 0x70, 0x84, 0x85, 0x71, 0x87, 0x73, 0x72, 0x86, 0x83, 0x77, 0x76, 0x82, 0x74, 0x80, 0x81, 0x75, 0x60, 0x94, 0x95, 0x61, 0x97, 0x63, 0x62, 0x96, 0x93, 0x67, 0x66, 0x92, 0x64, 0x90, 0x91, 0x65, 0x9b, 0x6f, 0x6e, 0x9a, 0x6c, 0x98, 0x99, 0x6d, 0x68, 0x9c, 0x9d, 0x69, 0x9f, 0x6b, 0x6a, 0x9e, 0x40, 0xb4, 0xb5, 0x41, 0xb7, 0x43, 0x42, 0xb6, 0xb3, 0x47, 0x46, 0xb2, 0x44, 0xb0, 0xb1, 0x45, 0xbb, 0x4f, 0x4e, 0xba, 0x4c, 0xb8, 0xb9, 0x4d, 0x48, 0xbc, 0xbd, 0x49, 0xbf, 0x4b, 0x4a, 0xbe, 0xab, 0x5f, 0x5e, 0xaa, 0x5c, 0xa8, 0xa9, 0x5d, 0x58, 0xac, 0xad, 0x59, 0xaf, 0x5b, 0x5a, 0xae, 0x50, 0xa4, 0xa5, 0x51, 0xa7, 0x53, 0x52, 0xa6, 0xa3, 0x57, 0x56, 0xa2, 0x54, 0xa0, 0xa1, 0x55}, + {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa, 0xeb, 0x1e, 0x1c, 0xe9, 0x18, 0xed, 0xef, 0x1a, 0x10, 0xe5, 0xe7, 0x12, 
0xe3, 0x16, 0x14, 0xe1, 0xcb, 0x3e, 0x3c, 0xc9, 0x38, 0xcd, 0xcf, 0x3a, 0x30, 0xc5, 0xc7, 0x32, 0xc3, 0x36, 0x34, 0xc1, 0x20, 0xd5, 0xd7, 0x22, 0xd3, 0x26, 0x24, 0xd1, 0xdb, 0x2e, 0x2c, 0xd9, 0x28, 0xdd, 0xdf, 0x2a, 0x8b, 0x7e, 0x7c, 0x89, 0x78, 0x8d, 0x8f, 0x7a, 0x70, 0x85, 0x87, 0x72, 0x83, 0x76, 0x74, 0x81, 0x60, 0x95, 0x97, 0x62, 0x93, 0x66, 0x64, 0x91, 0x9b, 0x6e, 0x6c, 0x99, 0x68, 0x9d, 0x9f, 0x6a, 0x40, 0xb5, 0xb7, 0x42, 0xb3, 0x46, 0x44, 0xb1, 0xbb, 0x4e, 0x4c, 0xb9, 0x48, 0xbd, 0xbf, 0x4a, 0xab, 0x5e, 0x5c, 0xa9, 0x58, 0xad, 0xaf, 0x5a, 0x50, 0xa5, 0xa7, 0x52, 0xa3, 0x56, 0x54, 0xa1, 0xb, 0xfe, 0xfc, 0x9, 0xf8, 0xd, 0xf, 0xfa, 0xf0, 0x5, 0x7, 0xf2, 0x3, 0xf6, 0xf4, 0x1, 0xe0, 0x15, 0x17, 0xe2, 0x13, 0xe6, 0xe4, 0x11, 0x1b, 0xee, 0xec, 0x19, 0xe8, 0x1d, 0x1f, 0xea, 0xc0, 0x35, 0x37, 0xc2, 0x33, 0xc6, 0xc4, 0x31, 0x3b, 0xce, 0xcc, 0x39, 0xc8, 0x3d, 0x3f, 0xca, 0x2b, 0xde, 0xdc, 0x29, 0xd8, 0x2d, 0x2f, 0xda, 0xd0, 0x25, 0x27, 0xd2, 0x23, 0xd6, 0xd4, 0x21, 0x80, 0x75, 0x77, 0x82, 0x73, 0x86, 0x84, 0x71, 0x7b, 0x8e, 0x8c, 0x79, 0x88, 0x7d, 0x7f, 0x8a, 0x6b, 0x9e, 0x9c, 0x69, 0x98, 0x6d, 0x6f, 0x9a, 0x90, 0x65, 0x67, 0x92, 0x63, 0x96, 0x94, 0x61, 0x4b, 0xbe, 0xbc, 0x49, 0xb8, 0x4d, 0x4f, 0xba, 0xb0, 0x45, 0x47, 0xb2, 0x43, 0xb6, 0xb4, 0x41, 0xa0, 0x55, 0x57, 0xa2, 0x53, 0xa6, 0xa4, 0x51, 0x5b, 0xae, 0xac, 0x59, 0xa8, 0x5d, 0x5f, 0xaa}, + {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b, 0xdb, 0x2d, 0x2a, 0xdc, 0x24, 0xd2, 0xd5, 0x23, 0x38, 0xce, 0xc9, 0x3f, 0xc7, 0x31, 0x36, 0xc0, 0xab, 0x5d, 0x5a, 0xac, 0x54, 0xa2, 0xa5, 0x53, 0x48, 0xbe, 0xb9, 0x4f, 0xb7, 0x41, 0x46, 0xb0, 0x70, 0x86, 0x81, 0x77, 0x8f, 0x79, 0x7e, 0x88, 0x93, 0x65, 0x62, 0x94, 0x6c, 0x9a, 0x9d, 0x6b, 0x4b, 0xbd, 0xba, 0x4c, 0xb4, 0x42, 0x45, 0xb3, 0xa8, 0x5e, 0x59, 0xaf, 0x57, 0xa1, 0xa6, 0x50, 0x90, 0x66, 0x61, 0x97, 0x6f, 0x99, 0x9e, 0x68, 0x73, 0x85, 0x82, 0x74, 0x8c, 0x7a, 0x7d, 0x8b, 0xe0, 0x16, 0x11, 0xe7, 0x1f, 0xe9, 0xee, 0x18, 0x3, 0xf5, 0xf2, 0x4, 0xfc, 0xa, 0xd, 0xfb, 0x3b, 0xcd, 0xca, 0x3c, 0xc4, 0x32, 0x35, 0xc3, 0xd8, 0x2e, 0x29, 0xdf, 0x27, 0xd1, 0xd6, 0x20, 0x96, 0x60, 0x67, 0x91, 0x69, 0x9f, 0x98, 0x6e, 0x75, 0x83, 0x84, 0x72, 0x8a, 0x7c, 0x7b, 0x8d, 0x4d, 0xbb, 0xbc, 0x4a, 0xb2, 0x44, 0x43, 0xb5, 0xae, 0x58, 0x5f, 0xa9, 0x51, 0xa7, 0xa0, 0x56, 0x3d, 0xcb, 0xcc, 0x3a, 0xc2, 0x34, 0x33, 0xc5, 0xde, 0x28, 0x2f, 0xd9, 0x21, 0xd7, 0xd0, 0x26, 0xe6, 0x10, 0x17, 0xe1, 0x19, 0xef, 0xe8, 0x1e, 0x5, 0xf3, 0xf4, 0x2, 0xfa, 0xc, 0xb, 0xfd, 0xdd, 0x2b, 0x2c, 0xda, 0x22, 0xd4, 0xd3, 0x25, 0x3e, 0xc8, 0xcf, 0x39, 0xc1, 0x37, 0x30, 0xc6, 0x6, 0xf0, 0xf7, 0x1, 0xf9, 0xf, 0x8, 0xfe, 0xe5, 0x13, 0x14, 0xe2, 0x1a, 0xec, 0xeb, 0x1d, 0x76, 0x80, 0x87, 0x71, 0x89, 0x7f, 0x78, 0x8e, 0x95, 0x63, 0x64, 0x92, 0x6a, 0x9c, 0x9b, 0x6d, 0xad, 0x5b, 0x5c, 0xaa, 0x52, 0xa4, 0xa3, 0x55, 0x4e, 0xb8, 0xbf, 0x49, 0xb1, 0x47, 0x40, 0xb6}, + {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14, 0xcb, 0x3c, 0x38, 0xcf, 0x30, 0xc7, 0xc3, 0x34, 0x20, 0xd7, 0xd3, 0x24, 0xdb, 0x2c, 0x28, 0xdf, 0x8b, 0x7c, 0x78, 0x8f, 0x70, 0x87, 0x83, 0x74, 0x60, 0x97, 0x93, 0x64, 0x9b, 0x6c, 0x68, 0x9f, 0x40, 0xb7, 0xb3, 0x44, 0xbb, 0x4c, 0x48, 0xbf, 0xab, 0x5c, 0x58, 0xaf, 0x50, 0xa7, 0xa3, 0x54, 0xb, 0xfc, 0xf8, 0xf, 0xf0, 0x7, 0x3, 0xf4, 0xe0, 0x17, 0x13, 0xe4, 0x1b, 0xec, 0xe8, 0x1f, 0xc0, 0x37, 0x33, 0xc4, 0x3b, 0xcc, 0xc8, 0x3f, 0x2b, 0xdc, 0xd8, 0x2f, 0xd0, 0x27, 0x23, 0xd4, 0x80, 0x77, 0x73, 0x84, 0x7b, 0x8c, 0x88, 0x7f, 0x6b, 0x9c, 0x98, 0x6f, 0x90, 0x67, 0x63, 0x94, 
0x4b, 0xbc, 0xb8, 0x4f, 0xb0, 0x47, 0x43, 0xb4, 0xa0, 0x57, 0x53, 0xa4, 0x5b, 0xac, 0xa8, 0x5f, 0x16, 0xe1, 0xe5, 0x12, 0xed, 0x1a, 0x1e, 0xe9, 0xfd, 0xa, 0xe, 0xf9, 0x6, 0xf1, 0xf5, 0x2, 0xdd, 0x2a, 0x2e, 0xd9, 0x26, 0xd1, 0xd5, 0x22, 0x36, 0xc1, 0xc5, 0x32, 0xcd, 0x3a, 0x3e, 0xc9, 0x9d, 0x6a, 0x6e, 0x99, 0x66, 0x91, 0x95, 0x62, 0x76, 0x81, 0x85, 0x72, 0x8d, 0x7a, 0x7e, 0x89, 0x56, 0xa1, 0xa5, 0x52, 0xad, 0x5a, 0x5e, 0xa9, 0xbd, 0x4a, 0x4e, 0xb9, 0x46, 0xb1, 0xb5, 0x42, 0x1d, 0xea, 0xee, 0x19, 0xe6, 0x11, 0x15, 0xe2, 0xf6, 0x1, 0x5, 0xf2, 0xd, 0xfa, 0xfe, 0x9, 0xd6, 0x21, 0x25, 0xd2, 0x2d, 0xda, 0xde, 0x29, 0x3d, 0xca, 0xce, 0x39, 0xc6, 0x31, 0x35, 0xc2, 0x96, 0x61, 0x65, 0x92, 0x6d, 0x9a, 0x9e, 0x69, 0x7d, 0x8a, 0x8e, 0x79, 0x86, 0x71, 0x75, 0x82, 0x5d, 0xaa, 0xae, 0x59, 0xa6, 0x51, 0x55, 0xa2, 0xb6, 0x41, 0x45, 0xb2, 0x4d, 0xba, 0xbe, 0x49}, + {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41, 0x3b, 0xc3, 0xd6, 0x2e, 0xfc, 0x4, 0x11, 0xe9, 0xa8, 0x50, 0x45, 0xbd, 0x6f, 0x97, 0x82, 0x7a, 0x76, 0x8e, 0x9b, 0x63, 0xb1, 0x49, 0x5c, 0xa4, 0xe5, 0x1d, 0x8, 0xf0, 0x22, 0xda, 0xcf, 0x37, 0x4d, 0xb5, 0xa0, 0x58, 0x8a, 0x72, 0x67, 0x9f, 0xde, 0x26, 0x33, 0xcb, 0x19, 0xe1, 0xf4, 0xc, 0xec, 0x14, 0x1, 0xf9, 0x2b, 0xd3, 0xc6, 0x3e, 0x7f, 0x87, 0x92, 0x6a, 0xb8, 0x40, 0x55, 0xad, 0xd7, 0x2f, 0x3a, 0xc2, 0x10, 0xe8, 0xfd, 0x5, 0x44, 0xbc, 0xa9, 0x51, 0x83, 0x7b, 0x6e, 0x96, 0x9a, 0x62, 0x77, 0x8f, 0x5d, 0xa5, 0xb0, 0x48, 0x9, 0xf1, 0xe4, 0x1c, 0xce, 0x36, 0x23, 0xdb, 0xa1, 0x59, 0x4c, 0xb4, 0x66, 0x9e, 0x8b, 0x73, 0x32, 0xca, 0xdf, 0x27, 0xf5, 0xd, 0x18, 0xe0, 0xc5, 0x3d, 0x28, 0xd0, 0x2, 0xfa, 0xef, 0x17, 0x56, 0xae, 0xbb, 0x43, 0x91, 0x69, 0x7c, 0x84, 0xfe, 0x6, 0x13, 0xeb, 0x39, 0xc1, 0xd4, 0x2c, 0x6d, 0x95, 0x80, 0x78, 0xaa, 0x52, 0x47, 0xbf, 0xb3, 0x4b, 0x5e, 0xa6, 0x74, 0x8c, 0x99, 0x61, 0x20, 0xd8, 0xcd, 0x35, 0xe7, 0x1f, 0xa, 0xf2, 0x88, 0x70, 0x65, 0x9d, 0x4f, 0xb7, 0xa2, 0x5a, 0x1b, 0xe3, 0xf6, 0xe, 0xdc, 0x24, 0x31, 0xc9, 0x29, 0xd1, 0xc4, 0x3c, 0xee, 0x16, 0x3, 0xfb, 0xba, 0x42, 0x57, 0xaf, 0x7d, 0x85, 0x90, 0x68, 0x12, 0xea, 0xff, 0x7, 0xd5, 0x2d, 0x38, 0xc0, 0x81, 0x79, 0x6c, 0x94, 0x46, 0xbe, 0xab, 0x53, 0x5f, 0xa7, 0xb2, 0x4a, 0x98, 0x60, 0x75, 0x8d, 0xcc, 0x34, 0x21, 0xd9, 0xb, 0xf3, 0xe6, 0x1e, 0x64, 0x9c, 0x89, 0x71, 0xa3, 0x5b, 0x4e, 0xb6, 0xf7, 0xf, 0x1a, 0xe2, 0x30, 0xc8, 0xdd, 0x25}, + {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e, 0x2b, 0xd2, 0xc4, 0x3d, 0xe8, 0x11, 0x7, 0xfe, 0xb0, 0x49, 0x5f, 0xa6, 0x73, 0x8a, 0x9c, 0x65, 0x56, 0xaf, 0xb9, 0x40, 0x95, 0x6c, 0x7a, 0x83, 0xcd, 0x34, 0x22, 0xdb, 0xe, 0xf7, 0xe1, 0x18, 0x7d, 0x84, 0x92, 0x6b, 0xbe, 0x47, 0x51, 0xa8, 0xe6, 0x1f, 0x9, 0xf0, 0x25, 0xdc, 0xca, 0x33, 0xac, 0x55, 0x43, 0xba, 0x6f, 0x96, 0x80, 0x79, 0x37, 0xce, 0xd8, 0x21, 0xf4, 0xd, 0x1b, 0xe2, 0x87, 0x7e, 0x68, 0x91, 0x44, 0xbd, 0xab, 0x52, 0x1c, 0xe5, 0xf3, 0xa, 0xdf, 0x26, 0x30, 0xc9, 0xfa, 0x3, 0x15, 0xec, 0x39, 0xc0, 0xd6, 0x2f, 0x61, 0x98, 0x8e, 0x77, 0xa2, 0x5b, 0x4d, 0xb4, 0xd1, 0x28, 0x3e, 0xc7, 0x12, 0xeb, 0xfd, 0x4, 0x4a, 0xb3, 0xa5, 0x5c, 0x89, 0x70, 0x66, 0x9f, 0x45, 0xbc, 0xaa, 0x53, 0x86, 0x7f, 0x69, 0x90, 0xde, 0x27, 0x31, 0xc8, 0x1d, 0xe4, 0xf2, 0xb, 0x6e, 0x97, 0x81, 0x78, 0xad, 0x54, 0x42, 0xbb, 0xf5, 0xc, 0x1a, 0xe3, 0x36, 0xcf, 0xd9, 0x20, 0x13, 0xea, 0xfc, 0x5, 0xd0, 0x29, 0x3f, 0xc6, 0x88, 0x71, 0x67, 0x9e, 0x4b, 0xb2, 0xa4, 0x5d, 0x38, 0xc1, 0xd7, 0x2e, 0xfb, 0x2, 0x14, 0xed, 0xa3, 0x5a, 0x4c, 0xb5, 0x60, 0x99, 0x8f, 0x76, 0xe9, 0x10, 0x6, 0xff, 0x2a, 
0xd3, 0xc5, 0x3c, 0x72, 0x8b, 0x9d, 0x64, 0xb1, 0x48, 0x5e, 0xa7, 0xc2, 0x3b, 0x2d, 0xd4, 0x1, 0xf8, 0xee, 0x17, 0x59, 0xa0, 0xb6, 0x4f, 0x9a, 0x63, 0x75, 0x8c, 0xbf, 0x46, 0x50, 0xa9, 0x7c, 0x85, 0x93, 0x6a, 0x24, 0xdd, 0xcb, 0x32, 0xe7, 0x1e, 0x8, 0xf1, 0x94, 0x6d, 0x7b, 0x82, 0x57, 0xae, 0xb8, 0x41, 0xf, 0xf6, 0xe0, 0x19, 0xcc, 0x35, 0x23, 0xda}, + {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f, 0x1b, 0xe1, 0xf2, 0x8, 0xd4, 0x2e, 0x3d, 0xc7, 0x98, 0x62, 0x71, 0x8b, 0x57, 0xad, 0xbe, 0x44, 0x36, 0xcc, 0xdf, 0x25, 0xf9, 0x3, 0x10, 0xea, 0xb5, 0x4f, 0x5c, 0xa6, 0x7a, 0x80, 0x93, 0x69, 0x2d, 0xd7, 0xc4, 0x3e, 0xe2, 0x18, 0xb, 0xf1, 0xae, 0x54, 0x47, 0xbd, 0x61, 0x9b, 0x88, 0x72, 0x6c, 0x96, 0x85, 0x7f, 0xa3, 0x59, 0x4a, 0xb0, 0xef, 0x15, 0x6, 0xfc, 0x20, 0xda, 0xc9, 0x33, 0x77, 0x8d, 0x9e, 0x64, 0xb8, 0x42, 0x51, 0xab, 0xf4, 0xe, 0x1d, 0xe7, 0x3b, 0xc1, 0xd2, 0x28, 0x5a, 0xa0, 0xb3, 0x49, 0x95, 0x6f, 0x7c, 0x86, 0xd9, 0x23, 0x30, 0xca, 0x16, 0xec, 0xff, 0x5, 0x41, 0xbb, 0xa8, 0x52, 0x8e, 0x74, 0x67, 0x9d, 0xc2, 0x38, 0x2b, 0xd1, 0xd, 0xf7, 0xe4, 0x1e, 0xd8, 0x22, 0x31, 0xcb, 0x17, 0xed, 0xfe, 0x4, 0x5b, 0xa1, 0xb2, 0x48, 0x94, 0x6e, 0x7d, 0x87, 0xc3, 0x39, 0x2a, 0xd0, 0xc, 0xf6, 0xe5, 0x1f, 0x40, 0xba, 0xa9, 0x53, 0x8f, 0x75, 0x66, 0x9c, 0xee, 0x14, 0x7, 0xfd, 0x21, 0xdb, 0xc8, 0x32, 0x6d, 0x97, 0x84, 0x7e, 0xa2, 0x58, 0x4b, 0xb1, 0xf5, 0xf, 0x1c, 0xe6, 0x3a, 0xc0, 0xd3, 0x29, 0x76, 0x8c, 0x9f, 0x65, 0xb9, 0x43, 0x50, 0xaa, 0xb4, 0x4e, 0x5d, 0xa7, 0x7b, 0x81, 0x92, 0x68, 0x37, 0xcd, 0xde, 0x24, 0xf8, 0x2, 0x11, 0xeb, 0xaf, 0x55, 0x46, 0xbc, 0x60, 0x9a, 0x89, 0x73, 0x2c, 0xd6, 0xc5, 0x3f, 0xe3, 0x19, 0xa, 0xf0, 0x82, 0x78, 0x6b, 0x91, 0x4d, 0xb7, 0xa4, 0x5e, 0x1, 0xfb, 0xe8, 0x12, 0xce, 0x34, 0x27, 0xdd, 0x99, 0x63, 0x70, 0x8a, 0x56, 0xac, 0xbf, 0x45, 0x1a, 0xe0, 0xf3, 0x9, 0xd5, 0x2f, 0x3c, 0xc6}, + {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50, 0xb, 0xf0, 0xe0, 0x1b, 0xc0, 0x3b, 0x2b, 0xd0, 0x80, 0x7b, 0x6b, 0x90, 0x4b, 0xb0, 0xa0, 0x5b, 0x16, 0xed, 0xfd, 0x6, 0xdd, 0x26, 0x36, 0xcd, 0x9d, 0x66, 0x76, 0x8d, 0x56, 0xad, 0xbd, 0x46, 0x1d, 0xe6, 0xf6, 0xd, 0xd6, 0x2d, 0x3d, 0xc6, 0x96, 0x6d, 0x7d, 0x86, 0x5d, 0xa6, 0xb6, 0x4d, 0x2c, 0xd7, 0xc7, 0x3c, 0xe7, 0x1c, 0xc, 0xf7, 0xa7, 0x5c, 0x4c, 0xb7, 0x6c, 0x97, 0x87, 0x7c, 0x27, 0xdc, 0xcc, 0x37, 0xec, 0x17, 0x7, 0xfc, 0xac, 0x57, 0x47, 0xbc, 0x67, 0x9c, 0x8c, 0x77, 0x3a, 0xc1, 0xd1, 0x2a, 0xf1, 0xa, 0x1a, 0xe1, 0xb1, 0x4a, 0x5a, 0xa1, 0x7a, 0x81, 0x91, 0x6a, 0x31, 0xca, 0xda, 0x21, 0xfa, 0x1, 0x11, 0xea, 0xba, 0x41, 0x51, 0xaa, 0x71, 0x8a, 0x9a, 0x61, 0x58, 0xa3, 0xb3, 0x48, 0x93, 0x68, 0x78, 0x83, 0xd3, 0x28, 0x38, 0xc3, 0x18, 0xe3, 0xf3, 0x8, 0x53, 0xa8, 0xb8, 0x43, 0x98, 0x63, 0x73, 0x88, 0xd8, 0x23, 0x33, 0xc8, 0x13, 0xe8, 0xf8, 0x3, 0x4e, 0xb5, 0xa5, 0x5e, 0x85, 0x7e, 0x6e, 0x95, 0xc5, 0x3e, 0x2e, 0xd5, 0xe, 0xf5, 0xe5, 0x1e, 0x45, 0xbe, 0xae, 0x55, 0x8e, 0x75, 0x65, 0x9e, 0xce, 0x35, 0x25, 0xde, 0x5, 0xfe, 0xee, 0x15, 0x74, 0x8f, 0x9f, 0x64, 0xbf, 0x44, 0x54, 0xaf, 0xff, 0x4, 0x14, 0xef, 0x34, 0xcf, 0xdf, 0x24, 0x7f, 0x84, 0x94, 0x6f, 0xb4, 0x4f, 0x5f, 0xa4, 0xf4, 0xf, 0x1f, 0xe4, 0x3f, 0xc4, 0xd4, 0x2f, 0x62, 0x99, 0x89, 0x72, 0xa9, 0x52, 0x42, 0xb9, 0xe9, 0x12, 0x2, 0xf9, 0x22, 0xd9, 0xc9, 0x32, 0x69, 0x92, 0x82, 0x79, 0xa2, 0x59, 0x49, 0xb2, 0xe2, 0x19, 0x9, 0xf2, 0x29, 0xd2, 0xc2, 0x39}, + {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d, 0x7b, 0x87, 0x9e, 0x62, 0xac, 0x50, 0x49, 0xb5, 0xc8, 
0x34, 0x2d, 0xd1, 0x1f, 0xe3, 0xfa, 0x6, 0xf6, 0xa, 0x13, 0xef, 0x21, 0xdd, 0xc4, 0x38, 0x45, 0xb9, 0xa0, 0x5c, 0x92, 0x6e, 0x77, 0x8b, 0x8d, 0x71, 0x68, 0x94, 0x5a, 0xa6, 0xbf, 0x43, 0x3e, 0xc2, 0xdb, 0x27, 0xe9, 0x15, 0xc, 0xf0, 0xf1, 0xd, 0x14, 0xe8, 0x26, 0xda, 0xc3, 0x3f, 0x42, 0xbe, 0xa7, 0x5b, 0x95, 0x69, 0x70, 0x8c, 0x8a, 0x76, 0x6f, 0x93, 0x5d, 0xa1, 0xb8, 0x44, 0x39, 0xc5, 0xdc, 0x20, 0xee, 0x12, 0xb, 0xf7, 0x7, 0xfb, 0xe2, 0x1e, 0xd0, 0x2c, 0x35, 0xc9, 0xb4, 0x48, 0x51, 0xad, 0x63, 0x9f, 0x86, 0x7a, 0x7c, 0x80, 0x99, 0x65, 0xab, 0x57, 0x4e, 0xb2, 0xcf, 0x33, 0x2a, 0xd6, 0x18, 0xe4, 0xfd, 0x1, 0xff, 0x3, 0x1a, 0xe6, 0x28, 0xd4, 0xcd, 0x31, 0x4c, 0xb0, 0xa9, 0x55, 0x9b, 0x67, 0x7e, 0x82, 0x84, 0x78, 0x61, 0x9d, 0x53, 0xaf, 0xb6, 0x4a, 0x37, 0xcb, 0xd2, 0x2e, 0xe0, 0x1c, 0x5, 0xf9, 0x9, 0xf5, 0xec, 0x10, 0xde, 0x22, 0x3b, 0xc7, 0xba, 0x46, 0x5f, 0xa3, 0x6d, 0x91, 0x88, 0x74, 0x72, 0x8e, 0x97, 0x6b, 0xa5, 0x59, 0x40, 0xbc, 0xc1, 0x3d, 0x24, 0xd8, 0x16, 0xea, 0xf3, 0xf, 0xe, 0xf2, 0xeb, 0x17, 0xd9, 0x25, 0x3c, 0xc0, 0xbd, 0x41, 0x58, 0xa4, 0x6a, 0x96, 0x8f, 0x73, 0x75, 0x89, 0x90, 0x6c, 0xa2, 0x5e, 0x47, 0xbb, 0xc6, 0x3a, 0x23, 0xdf, 0x11, 0xed, 0xf4, 0x8, 0xf8, 0x4, 0x1d, 0xe1, 0x2f, 0xd3, 0xca, 0x36, 0x4b, 0xb7, 0xae, 0x52, 0x9c, 0x60, 0x79, 0x85, 0x83, 0x7f, 0x66, 0x9a, 0x54, 0xa8, 0xb1, 0x4d, 0x30, 0xcc, 0xd5, 0x29, 0xe7, 0x1b, 0x2, 0xfe}, + {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72, 0x6b, 0x96, 0x8c, 0x71, 0xb8, 0x45, 0x5f, 0xa2, 0xd0, 0x2d, 0x37, 0xca, 0x3, 0xfe, 0xe4, 0x19, 0xd6, 0x2b, 0x31, 0xcc, 0x5, 0xf8, 0xe2, 0x1f, 0x6d, 0x90, 0x8a, 0x77, 0xbe, 0x43, 0x59, 0xa4, 0xbd, 0x40, 0x5a, 0xa7, 0x6e, 0x93, 0x89, 0x74, 0x6, 0xfb, 0xe1, 0x1c, 0xd5, 0x28, 0x32, 0xcf, 0xb1, 0x4c, 0x56, 0xab, 0x62, 0x9f, 0x85, 0x78, 0xa, 0xf7, 0xed, 0x10, 0xd9, 0x24, 0x3e, 0xc3, 0xda, 0x27, 0x3d, 0xc0, 0x9, 0xf4, 0xee, 0x13, 0x61, 0x9c, 0x86, 0x7b, 0xb2, 0x4f, 0x55, 0xa8, 0x67, 0x9a, 0x80, 0x7d, 0xb4, 0x49, 0x53, 0xae, 0xdc, 0x21, 0x3b, 0xc6, 0xf, 0xf2, 0xe8, 0x15, 0xc, 0xf1, 0xeb, 0x16, 0xdf, 0x22, 0x38, 0xc5, 0xb7, 0x4a, 0x50, 0xad, 0x64, 0x99, 0x83, 0x7e, 0x7f, 0x82, 0x98, 0x65, 0xac, 0x51, 0x4b, 0xb6, 0xc4, 0x39, 0x23, 0xde, 0x17, 0xea, 0xf0, 0xd, 0x14, 0xe9, 0xf3, 0xe, 0xc7, 0x3a, 0x20, 0xdd, 0xaf, 0x52, 0x48, 0xb5, 0x7c, 0x81, 0x9b, 0x66, 0xa9, 0x54, 0x4e, 0xb3, 0x7a, 0x87, 0x9d, 0x60, 0x12, 0xef, 0xf5, 0x8, 0xc1, 0x3c, 0x26, 0xdb, 0xc2, 0x3f, 0x25, 0xd8, 0x11, 0xec, 0xf6, 0xb, 0x79, 0x84, 0x9e, 0x63, 0xaa, 0x57, 0x4d, 0xb0, 0xce, 0x33, 0x29, 0xd4, 0x1d, 0xe0, 0xfa, 0x7, 0x75, 0x88, 0x92, 0x6f, 0xa6, 0x5b, 0x41, 0xbc, 0xa5, 0x58, 0x42, 0xbf, 0x76, 0x8b, 0x91, 0x6c, 0x1e, 0xe3, 0xf9, 0x4, 0xcd, 0x30, 0x2a, 0xd7, 0x18, 0xe5, 0xff, 0x2, 0xcb, 0x36, 0x2c, 0xd1, 0xa3, 0x5e, 0x44, 0xb9, 0x70, 0x8d, 0x97, 0x6a, 0x73, 0x8e, 0x94, 0x69, 0xa0, 0x5d, 0x47, 0xba, 0xc8, 0x35, 0x2f, 0xd2, 0x1b, 0xe6, 0xfc, 0x1}, + {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63, 0x5b, 0xa5, 0xba, 0x44, 0x84, 0x7a, 0x65, 0x9b, 0xf8, 0x6, 0x19, 0xe7, 0x27, 0xd9, 0xc6, 0x38, 0xb6, 0x48, 0x57, 0xa9, 0x69, 0x97, 0x88, 0x76, 0x15, 0xeb, 0xf4, 0xa, 0xca, 0x34, 0x2b, 0xd5, 0xed, 0x13, 0xc, 0xf2, 0x32, 0xcc, 0xd3, 0x2d, 0x4e, 0xb0, 0xaf, 0x51, 0x91, 0x6f, 0x70, 0x8e, 0x71, 0x8f, 0x90, 0x6e, 0xae, 0x50, 0x4f, 0xb1, 0xd2, 0x2c, 0x33, 0xcd, 0xd, 0xf3, 0xec, 0x12, 0x2a, 0xd4, 0xcb, 0x35, 0xf5, 0xb, 0x14, 0xea, 0x89, 0x77, 0x68, 0x96, 0x56, 0xa8, 0xb7, 0x49, 0xc7, 0x39, 0x26, 0xd8, 0x18, 0xe6, 0xf9, 0x7, 0x64, 0x9a, 0x85, 0x7b, 0xbb, 0x45, 
0x5a, 0xa4, 0x9c, 0x62, 0x7d, 0x83, 0x43, 0xbd, 0xa2, 0x5c, 0x3f, 0xc1, 0xde, 0x20, 0xe0, 0x1e, 0x1, 0xff, 0xe2, 0x1c, 0x3, 0xfd, 0x3d, 0xc3, 0xdc, 0x22, 0x41, 0xbf, 0xa0, 0x5e, 0x9e, 0x60, 0x7f, 0x81, 0xb9, 0x47, 0x58, 0xa6, 0x66, 0x98, 0x87, 0x79, 0x1a, 0xe4, 0xfb, 0x5, 0xc5, 0x3b, 0x24, 0xda, 0x54, 0xaa, 0xb5, 0x4b, 0x8b, 0x75, 0x6a, 0x94, 0xf7, 0x9, 0x16, 0xe8, 0x28, 0xd6, 0xc9, 0x37, 0xf, 0xf1, 0xee, 0x10, 0xd0, 0x2e, 0x31, 0xcf, 0xac, 0x52, 0x4d, 0xb3, 0x73, 0x8d, 0x92, 0x6c, 0x93, 0x6d, 0x72, 0x8c, 0x4c, 0xb2, 0xad, 0x53, 0x30, 0xce, 0xd1, 0x2f, 0xef, 0x11, 0xe, 0xf0, 0xc8, 0x36, 0x29, 0xd7, 0x17, 0xe9, 0xf6, 0x8, 0x6b, 0x95, 0x8a, 0x74, 0xb4, 0x4a, 0x55, 0xab, 0x25, 0xdb, 0xc4, 0x3a, 0xfa, 0x4, 0x1b, 0xe5, 0x86, 0x78, 0x67, 0x99, 0x59, 0xa7, 0xb8, 0x46, 0x7e, 0x80, 0x9f, 0x61, 0xa1, 0x5f, 0x40, 0xbe, 0xdd, 0x23, 0x3c, 0xc2, 0x2, 0xfc, 0xe3, 0x1d}, + {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c, 0x4b, 0xb4, 0xa8, 0x57, 0x90, 0x6f, 0x73, 0x8c, 0xe0, 0x1f, 0x3, 0xfc, 0x3b, 0xc4, 0xd8, 0x27, 0x96, 0x69, 0x75, 0x8a, 0x4d, 0xb2, 0xae, 0x51, 0x3d, 0xc2, 0xde, 0x21, 0xe6, 0x19, 0x5, 0xfa, 0xdd, 0x22, 0x3e, 0xc1, 0x6, 0xf9, 0xe5, 0x1a, 0x76, 0x89, 0x95, 0x6a, 0xad, 0x52, 0x4e, 0xb1, 0x31, 0xce, 0xd2, 0x2d, 0xea, 0x15, 0x9, 0xf6, 0x9a, 0x65, 0x79, 0x86, 0x41, 0xbe, 0xa2, 0x5d, 0x7a, 0x85, 0x99, 0x66, 0xa1, 0x5e, 0x42, 0xbd, 0xd1, 0x2e, 0x32, 0xcd, 0xa, 0xf5, 0xe9, 0x16, 0xa7, 0x58, 0x44, 0xbb, 0x7c, 0x83, 0x9f, 0x60, 0xc, 0xf3, 0xef, 0x10, 0xd7, 0x28, 0x34, 0xcb, 0xec, 0x13, 0xf, 0xf0, 0x37, 0xc8, 0xd4, 0x2b, 0x47, 0xb8, 0xa4, 0x5b, 0x9c, 0x63, 0x7f, 0x80, 0x62, 0x9d, 0x81, 0x7e, 0xb9, 0x46, 0x5a, 0xa5, 0xc9, 0x36, 0x2a, 0xd5, 0x12, 0xed, 0xf1, 0xe, 0x29, 0xd6, 0xca, 0x35, 0xf2, 0xd, 0x11, 0xee, 0x82, 0x7d, 0x61, 0x9e, 0x59, 0xa6, 0xba, 0x45, 0xf4, 0xb, 0x17, 0xe8, 0x2f, 0xd0, 0xcc, 0x33, 0x5f, 0xa0, 0xbc, 0x43, 0x84, 0x7b, 0x67, 0x98, 0xbf, 0x40, 0x5c, 0xa3, 0x64, 0x9b, 0x87, 0x78, 0x14, 0xeb, 0xf7, 0x8, 0xcf, 0x30, 0x2c, 0xd3, 0x53, 0xac, 0xb0, 0x4f, 0x88, 0x77, 0x6b, 0x94, 0xf8, 0x7, 0x1b, 0xe4, 0x23, 0xdc, 0xc0, 0x3f, 0x18, 0xe7, 0xfb, 0x4, 0xc3, 0x3c, 0x20, 0xdf, 0xb3, 0x4c, 0x50, 0xaf, 0x68, 0x97, 0x8b, 0x74, 0xc5, 0x3a, 0x26, 0xd9, 0x1e, 0xe1, 0xfd, 0x2, 0x6e, 0x91, 0x8d, 0x72, 0xb5, 0x4a, 0x56, 0xa9, 0x8e, 0x71, 0x6d, 0x92, 0x55, 0xaa, 0xb6, 0x49, 0x25, 0xda, 0xc6, 0x39, 0xfe, 0x1, 0x1d, 0xe2}} + +var mulTableLow = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, + {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e}, + {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11}, + {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c}, + {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33}, + {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22}, + {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d}, + {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78}, + {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77}, + {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66}, + {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69}, 
+ {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44}, + {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b}, + {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a}, + {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55}, + {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0}, + {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, + {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee}, + {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1}, + {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc}, + {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3}, + {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2}, + {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd}, + {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88}, + {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87}, + {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96}, + {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}, + {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4}, + {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb}, + {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa}, + {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5}, + {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd}, + {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2}, + {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3}, + {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec}, + {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1}, + {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce}, + {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf}, + {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0}, + {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85}, + {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a}, + {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b}, + {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94}, + {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9}, + {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6}, + {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7}, + {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8}, + 
{0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd}, + {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2}, + {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13}, + {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c}, + {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31}, + {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e}, + {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f}, + {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20}, + {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75}, + {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a}, + {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b}, + {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64}, + {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49}, + {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46}, + {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57}, + {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58}, + {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7}, + {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8}, + {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9}, + {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6}, + {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb}, + {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4}, + {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5}, + {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca}, + {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f}, + {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90}, + {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81}, + {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e}, + {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3}, + {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac}, + {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd}, + {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2}, + {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17}, + {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18}, + {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9}, + {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6}, + {0x0, 0x54, 0xa8, 
0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b}, + {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24}, + {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35}, + {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a}, + {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f}, + {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60}, + {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71}, + {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e}, + {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53}, + {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c}, + {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d}, + {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42}, + {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a}, + {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15}, + {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4}, + {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb}, + {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26}, + {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29}, + {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38}, + {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37}, + {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62}, + {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d}, + {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c}, + {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73}, + {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e}, + {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51}, + {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40}, + {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f}, + {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea}, + {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5}, + {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4}, + {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb}, + {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6}, + {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9}, + {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8}, + {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7}, + {0x0, 0x78, 0xf0, 0x88, 
0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92}, + {0x0, 0x79, 0xf2, 0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d}, + {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c}, + {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83}, + {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae}, + {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1}, + {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0}, + {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf}, + {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 0xce, 0x53, 0xd3}, + {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc}, + {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd}, + {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2}, + {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef}, + {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0}, + {0x0, 0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1}, + {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe}, + {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab}, + {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4}, + {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5}, + {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba}, + {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97}, + {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98}, + {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89}, + {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86}, + {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23}, + {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c}, + {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d}, + {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32}, + {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f}, + {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10}, + {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1}, + {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe}, + {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b}, + {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54}, + {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45}, + {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a}, + {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 
0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67}, + {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68}, + {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79}, + {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76}, + {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e}, + {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21}, + {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30}, + {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f}, + {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12}, + {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d}, + {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc}, + {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3}, + {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56}, + {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59}, + {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48}, + {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47}, + {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a}, + {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65}, + {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74}, + {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b}, + {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde}, + {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1}, + {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0}, + {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf}, + {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2}, + {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed}, + {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc}, + {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3}, + {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6}, + {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9}, + {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8}, + {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7}, + {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a}, + {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95}, + {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84}, + {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b}, + {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 
0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34}, + {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b}, + {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a}, + {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25}, + {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8}, + {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7}, + {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16}, + {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19}, + {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c}, + {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43}, + {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52}, + {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d}, + {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70}, + {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f}, + {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e}, + {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61}, + {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4}, + {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb}, + {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda}, + {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5}, + {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8}, + {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7}, + {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6}, + {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9}, + {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc}, + {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3}, + {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2}, + {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad}, + {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80}, + {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f}, + {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e}, + {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91}, + {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9}, + {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6}, + {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7}, + {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8}, + {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 
0xc4, 0x20, 0x11, 0xf5}, + {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa}, + {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb}, + {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4}, + {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1}, + {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe}, + {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf}, + {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0}, + {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d}, + {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82}, + {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93}, + {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c}, + {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39}, + {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36}, + {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27}, + {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28}, + {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5}, + {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa}, + {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b}, + {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14}, + {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41}, + {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e}, + {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 0x5f}, + {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50}, + {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d}, + {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72}, + {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63}, + {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c}} +var mulTableHigh = [256][16]uint8{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0}, + {0x0, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x1d, 0x3d, 0x5d, 0x7d, 0x9d, 0xbd, 0xdd, 0xfd}, + {0x0, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x9d, 0xad, 0xfd, 0xcd, 0x5d, 0x6d, 0x3d, 0xd}, + {0x0, 0x40, 0x80, 0xc0, 0x1d, 0x5d, 0x9d, 0xdd, 0x3a, 0x7a, 0xba, 0xfa, 0x27, 0x67, 0xa7, 0xe7}, + {0x0, 0x50, 0xa0, 0xf0, 0x5d, 0xd, 0xfd, 0xad, 0xba, 0xea, 0x1a, 0x4a, 0xe7, 0xb7, 0x47, 0x17}, + {0x0, 0x60, 0xc0, 0xa0, 0x9d, 0xfd, 0x5d, 0x3d, 0x27, 0x47, 0xe7, 0x87, 0xba, 0xda, 0x7a, 0x1a}, + {0x0, 0x70, 0xe0, 0x90, 0xdd, 0xad, 0x3d, 0x4d, 0xa7, 0xd7, 0x47, 0x37, 0x7a, 0xa, 0x9a, 0xea}, + {0x0, 0x80, 0x1d, 0x9d, 0x3a, 0xba, 0x27, 0xa7, 0x74, 0xf4, 0x69, 0xe9, 0x4e, 
0xce, 0x53, 0xd3}, + {0x0, 0x90, 0x3d, 0xad, 0x7a, 0xea, 0x47, 0xd7, 0xf4, 0x64, 0xc9, 0x59, 0x8e, 0x1e, 0xb3, 0x23}, + {0x0, 0xa0, 0x5d, 0xfd, 0xba, 0x1a, 0xe7, 0x47, 0x69, 0xc9, 0x34, 0x94, 0xd3, 0x73, 0x8e, 0x2e}, + {0x0, 0xb0, 0x7d, 0xcd, 0xfa, 0x4a, 0x87, 0x37, 0xe9, 0x59, 0x94, 0x24, 0x13, 0xa3, 0x6e, 0xde}, + {0x0, 0xc0, 0x9d, 0x5d, 0x27, 0xe7, 0xba, 0x7a, 0x4e, 0x8e, 0xd3, 0x13, 0x69, 0xa9, 0xf4, 0x34}, + {0x0, 0xd0, 0xbd, 0x6d, 0x67, 0xb7, 0xda, 0xa, 0xce, 0x1e, 0x73, 0xa3, 0xa9, 0x79, 0x14, 0xc4}, + {0x0, 0xe0, 0xdd, 0x3d, 0xa7, 0x47, 0x7a, 0x9a, 0x53, 0xb3, 0x8e, 0x6e, 0xf4, 0x14, 0x29, 0xc9}, + {0x0, 0xf0, 0xfd, 0xd, 0xe7, 0x17, 0x1a, 0xea, 0xd3, 0x23, 0x2e, 0xde, 0x34, 0xc4, 0xc9, 0x39}, + {0x0, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb}, + {0x0, 0xd, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b}, + {0x0, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xf5, 0xc8, 0x8f, 0xb2, 0x1, 0x3c, 0x7b, 0x46}, + {0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x75, 0x58, 0x2f, 0x2, 0xc1, 0xec, 0x9b, 0xb6}, + {0x0, 0x5d, 0xba, 0xe7, 0x69, 0x34, 0xd3, 0x8e, 0xd2, 0x8f, 0x68, 0x35, 0xbb, 0xe6, 0x1, 0x5c}, + {0x0, 0x4d, 0x9a, 0xd7, 0x29, 0x64, 0xb3, 0xfe, 0x52, 0x1f, 0xc8, 0x85, 0x7b, 0x36, 0xe1, 0xac}, + {0x0, 0x7d, 0xfa, 0x87, 0xe9, 0x94, 0x13, 0x6e, 0xcf, 0xb2, 0x35, 0x48, 0x26, 0x5b, 0xdc, 0xa1}, + {0x0, 0x6d, 0xda, 0xb7, 0xa9, 0xc4, 0x73, 0x1e, 0x4f, 0x22, 0x95, 0xf8, 0xe6, 0x8b, 0x3c, 0x51}, + {0x0, 0x9d, 0x27, 0xba, 0x4e, 0xd3, 0x69, 0xf4, 0x9c, 0x1, 0xbb, 0x26, 0xd2, 0x4f, 0xf5, 0x68}, + {0x0, 0x8d, 0x7, 0x8a, 0xe, 0x83, 0x9, 0x84, 0x1c, 0x91, 0x1b, 0x96, 0x12, 0x9f, 0x15, 0x98}, + {0x0, 0xbd, 0x67, 0xda, 0xce, 0x73, 0xa9, 0x14, 0x81, 0x3c, 0xe6, 0x5b, 0x4f, 0xf2, 0x28, 0x95}, + {0x0, 0xad, 0x47, 0xea, 0x8e, 0x23, 0xc9, 0x64, 0x1, 0xac, 0x46, 0xeb, 0x8f, 0x22, 0xc8, 0x65}, + {0x0, 0xdd, 0xa7, 0x7a, 0x53, 0x8e, 0xf4, 0x29, 0xa6, 0x7b, 0x1, 0xdc, 0xf5, 0x28, 0x52, 0x8f}, + {0x0, 0xcd, 0x87, 0x4a, 0x13, 0xde, 0x94, 0x59, 0x26, 0xeb, 0xa1, 0x6c, 0x35, 0xf8, 0xb2, 0x7f}, + {0x0, 0xfd, 0xe7, 0x1a, 0xd3, 0x2e, 0x34, 0xc9, 0xbb, 0x46, 0x5c, 0xa1, 0x68, 0x95, 0x8f, 0x72}, + {0x0, 0xed, 0xc7, 0x2a, 0x93, 0x7e, 0x54, 0xb9, 0x3b, 0xd6, 0xfc, 0x11, 0xa8, 0x45, 0x6f, 0x82}, + {0x0, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b}, + {0x0, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x4d, 0x67, 0x19, 0x33, 0xe5, 0xcf, 0xb1, 0x9b}, + {0x0, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96}, + {0x0, 0xa, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66}, + {0x0, 0x7a, 0xf4, 0x8e, 0xf5, 0x8f, 0x1, 0x7b, 0xf7, 0x8d, 0x3, 0x79, 0x2, 0x78, 0xf6, 0x8c}, + {0x0, 0x6a, 0xd4, 0xbe, 0xb5, 0xdf, 0x61, 0xb, 0x77, 0x1d, 0xa3, 0xc9, 0xc2, 0xa8, 0x16, 0x7c}, + {0x0, 0x5a, 0xb4, 0xee, 0x75, 0x2f, 0xc1, 0x9b, 0xea, 0xb0, 0x5e, 0x4, 0x9f, 0xc5, 0x2b, 0x71}, + {0x0, 0x4a, 0x94, 0xde, 0x35, 0x7f, 0xa1, 0xeb, 0x6a, 0x20, 0xfe, 0xb4, 0x5f, 0x15, 0xcb, 0x81}, + {0x0, 0xba, 0x69, 0xd3, 0xd2, 0x68, 0xbb, 0x1, 0xb9, 0x3, 0xd0, 0x6a, 0x6b, 0xd1, 0x2, 0xb8}, + {0x0, 0xaa, 0x49, 0xe3, 0x92, 0x38, 0xdb, 0x71, 0x39, 0x93, 0x70, 0xda, 0xab, 0x1, 0xe2, 0x48}, + {0x0, 0x9a, 0x29, 0xb3, 0x52, 0xc8, 0x7b, 0xe1, 0xa4, 0x3e, 0x8d, 0x17, 0xf6, 0x6c, 0xdf, 0x45}, + {0x0, 0x8a, 0x9, 0x83, 0x12, 0x98, 0x1b, 0x91, 0x24, 0xae, 0x2d, 0xa7, 0x36, 0xbc, 0x3f, 0xb5}, + {0x0, 0xfa, 0xe9, 0x13, 0xcf, 0x35, 0x26, 0xdc, 0x83, 0x79, 0x6a, 0x90, 0x4c, 0xb6, 0xa5, 
0x5f}, + {0x0, 0xea, 0xc9, 0x23, 0x8f, 0x65, 0x46, 0xac, 0x3, 0xe9, 0xca, 0x20, 0x8c, 0x66, 0x45, 0xaf}, + {0x0, 0xda, 0xa9, 0x73, 0x4f, 0x95, 0xe6, 0x3c, 0x9e, 0x44, 0x37, 0xed, 0xd1, 0xb, 0x78, 0xa2}, + {0x0, 0xca, 0x89, 0x43, 0xf, 0xc5, 0x86, 0x4c, 0x1e, 0xd4, 0x97, 0x5d, 0x11, 0xdb, 0x98, 0x52}, + {0x0, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x2, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0}, + {0x0, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xa5, 0x92, 0xcb, 0xfc, 0x79, 0x4e, 0x17, 0x20}, + {0x0, 0x7, 0xe, 0x9, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d}, + {0x0, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd}, + {0x0, 0x67, 0xce, 0xa9, 0x81, 0xe6, 0x4f, 0x28, 0x1f, 0x78, 0xd1, 0xb6, 0x9e, 0xf9, 0x50, 0x37}, + {0x0, 0x77, 0xee, 0x99, 0xc1, 0xb6, 0x2f, 0x58, 0x9f, 0xe8, 0x71, 0x6, 0x5e, 0x29, 0xb0, 0xc7}, + {0x0, 0x47, 0x8e, 0xc9, 0x1, 0x46, 0x8f, 0xc8, 0x2, 0x45, 0x8c, 0xcb, 0x3, 0x44, 0x8d, 0xca}, + {0x0, 0x57, 0xae, 0xf9, 0x41, 0x16, 0xef, 0xb8, 0x82, 0xd5, 0x2c, 0x7b, 0xc3, 0x94, 0x6d, 0x3a}, + {0x0, 0xa7, 0x53, 0xf4, 0xa6, 0x1, 0xf5, 0x52, 0x51, 0xf6, 0x2, 0xa5, 0xf7, 0x50, 0xa4, 0x3}, + {0x0, 0xb7, 0x73, 0xc4, 0xe6, 0x51, 0x95, 0x22, 0xd1, 0x66, 0xa2, 0x15, 0x37, 0x80, 0x44, 0xf3}, + {0x0, 0x87, 0x13, 0x94, 0x26, 0xa1, 0x35, 0xb2, 0x4c, 0xcb, 0x5f, 0xd8, 0x6a, 0xed, 0x79, 0xfe}, + {0x0, 0x97, 0x33, 0xa4, 0x66, 0xf1, 0x55, 0xc2, 0xcc, 0x5b, 0xff, 0x68, 0xaa, 0x3d, 0x99, 0xe}, + {0x0, 0xe7, 0xd3, 0x34, 0xbb, 0x5c, 0x68, 0x8f, 0x6b, 0x8c, 0xb8, 0x5f, 0xd0, 0x37, 0x3, 0xe4}, + {0x0, 0xf7, 0xf3, 0x4, 0xfb, 0xc, 0x8, 0xff, 0xeb, 0x1c, 0x18, 0xef, 0x10, 0xe7, 0xe3, 0x14}, + {0x0, 0xc7, 0x93, 0x54, 0x3b, 0xfc, 0xa8, 0x6f, 0x76, 0xb1, 0xe5, 0x22, 0x4d, 0x8a, 0xde, 0x19}, + {0x0, 0xd7, 0xb3, 0x64, 0x7b, 0xac, 0xc8, 0x1f, 0xf6, 0x21, 0x45, 0x92, 0x8d, 0x5a, 0x3e, 0xe9}, + {0x0, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6}, + {0x0, 0x64, 0xc8, 0xac, 0x8d, 0xe9, 0x45, 0x21, 0x7, 0x63, 0xcf, 0xab, 0x8a, 0xee, 0x42, 0x26}, + {0x0, 0x54, 0xa8, 0xfc, 0x4d, 0x19, 0xe5, 0xb1, 0x9a, 0xce, 0x32, 0x66, 0xd7, 0x83, 0x7f, 0x2b}, + {0x0, 0x44, 0x88, 0xcc, 0xd, 0x49, 0x85, 0xc1, 0x1a, 0x5e, 0x92, 0xd6, 0x17, 0x53, 0x9f, 0xdb}, + {0x0, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xbd, 0x89, 0xd5, 0xe1, 0x6d, 0x59, 0x5, 0x31}, + {0x0, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x3d, 0x19, 0x75, 0x51, 0xad, 0x89, 0xe5, 0xc1}, + {0x0, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc}, + {0x0, 0x4, 0x8, 0xc, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c}, + {0x0, 0xf4, 0xf5, 0x1, 0xf7, 0x3, 0x2, 0xf6, 0xf3, 0x7, 0x6, 0xf2, 0x4, 0xf0, 0xf1, 0x5}, + {0x0, 0xe4, 0xd5, 0x31, 0xb7, 0x53, 0x62, 0x86, 0x73, 0x97, 0xa6, 0x42, 0xc4, 0x20, 0x11, 0xf5}, + {0x0, 0xd4, 0xb5, 0x61, 0x77, 0xa3, 0xc2, 0x16, 0xee, 0x3a, 0x5b, 0x8f, 0x99, 0x4d, 0x2c, 0xf8}, + {0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0x66, 0x6e, 0xaa, 0xfb, 0x3f, 0x59, 0x9d, 0xcc, 0x8}, + {0x0, 0xb4, 0x75, 0xc1, 0xea, 0x5e, 0x9f, 0x2b, 0xc9, 0x7d, 0xbc, 0x8, 0x23, 0x97, 0x56, 0xe2}, + {0x0, 0xa4, 0x55, 0xf1, 0xaa, 0xe, 0xff, 0x5b, 0x49, 0xed, 0x1c, 0xb8, 0xe3, 0x47, 0xb6, 0x12}, + {0x0, 0x94, 0x35, 0xa1, 0x6a, 0xfe, 0x5f, 0xcb, 0xd4, 0x40, 0xe1, 0x75, 0xbe, 0x2a, 0x8b, 0x1f}, + {0x0, 0x84, 0x15, 0x91, 0x2a, 0xae, 0x3f, 0xbb, 0x54, 0xd0, 0x41, 0xc5, 0x7e, 0xfa, 0x6b, 0xef}, + {0x0, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x2, 0x6f, 0x6, 0xbd, 0xd4, 0xd6, 0xbf, 0x4, 0x6d}, + {0x0, 0x79, 0xf2, 
0x8b, 0xf9, 0x80, 0xb, 0x72, 0xef, 0x96, 0x1d, 0x64, 0x16, 0x6f, 0xe4, 0x9d}, + {0x0, 0x49, 0x92, 0xdb, 0x39, 0x70, 0xab, 0xe2, 0x72, 0x3b, 0xe0, 0xa9, 0x4b, 0x2, 0xd9, 0x90}, + {0x0, 0x59, 0xb2, 0xeb, 0x79, 0x20, 0xcb, 0x92, 0xf2, 0xab, 0x40, 0x19, 0x8b, 0xd2, 0x39, 0x60}, + {0x0, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x55, 0x7c, 0x7, 0x2e, 0xf1, 0xd8, 0xa3, 0x8a}, + {0x0, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xd5, 0xec, 0xa7, 0x9e, 0x31, 0x8, 0x43, 0x7a}, + {0x0, 0x9, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77}, + {0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87}, + {0x0, 0xe9, 0xcf, 0x26, 0x83, 0x6a, 0x4c, 0xa5, 0x1b, 0xf2, 0xd4, 0x3d, 0x98, 0x71, 0x57, 0xbe}, + {0x0, 0xf9, 0xef, 0x16, 0xc3, 0x3a, 0x2c, 0xd5, 0x9b, 0x62, 0x74, 0x8d, 0x58, 0xa1, 0xb7, 0x4e}, + {0x0, 0xc9, 0x8f, 0x46, 0x3, 0xca, 0x8c, 0x45, 0x6, 0xcf, 0x89, 0x40, 0x5, 0xcc, 0x8a, 0x43}, + {0x0, 0xd9, 0xaf, 0x76, 0x43, 0x9a, 0xec, 0x35, 0x86, 0x5f, 0x29, 0xf0, 0xc5, 0x1c, 0x6a, 0xb3}, + {0x0, 0xa9, 0x4f, 0xe6, 0x9e, 0x37, 0xd1, 0x78, 0x21, 0x88, 0x6e, 0xc7, 0xbf, 0x16, 0xf0, 0x59}, + {0x0, 0xb9, 0x6f, 0xd6, 0xde, 0x67, 0xb1, 0x8, 0xa1, 0x18, 0xce, 0x77, 0x7f, 0xc6, 0x10, 0xa9}, + {0x0, 0x89, 0xf, 0x86, 0x1e, 0x97, 0x11, 0x98, 0x3c, 0xb5, 0x33, 0xba, 0x22, 0xab, 0x2d, 0xa4}, + {0x0, 0x99, 0x2f, 0xb6, 0x5e, 0xc7, 0x71, 0xe8, 0xbc, 0x25, 0x93, 0xa, 0xe2, 0x7b, 0xcd, 0x54}, + {0x0, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x4, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd}, + {0x0, 0x5e, 0xbc, 0xe2, 0x65, 0x3b, 0xd9, 0x87, 0xca, 0x94, 0x76, 0x28, 0xaf, 0xf1, 0x13, 0x4d}, + {0x0, 0x6e, 0xdc, 0xb2, 0xa5, 0xcb, 0x79, 0x17, 0x57, 0x39, 0x8b, 0xe5, 0xf2, 0x9c, 0x2e, 0x40}, + {0x0, 0x7e, 0xfc, 0x82, 0xe5, 0x9b, 0x19, 0x67, 0xd7, 0xa9, 0x2b, 0x55, 0x32, 0x4c, 0xce, 0xb0}, + {0x0, 0xe, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a}, + {0x0, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa}, + {0x0, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x6d, 0x43, 0x31, 0x1f, 0xd5, 0xfb, 0x89, 0xa7}, + {0x0, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xed, 0xd3, 0x91, 0xaf, 0x15, 0x2b, 0x69, 0x57}, + {0x0, 0xce, 0x81, 0x4f, 0x1f, 0xd1, 0x9e, 0x50, 0x3e, 0xf0, 0xbf, 0x71, 0x21, 0xef, 0xa0, 0x6e}, + {0x0, 0xde, 0xa1, 0x7f, 0x5f, 0x81, 0xfe, 0x20, 0xbe, 0x60, 0x1f, 0xc1, 0xe1, 0x3f, 0x40, 0x9e}, + {0x0, 0xee, 0xc1, 0x2f, 0x9f, 0x71, 0x5e, 0xb0, 0x23, 0xcd, 0xe2, 0xc, 0xbc, 0x52, 0x7d, 0x93}, + {0x0, 0xfe, 0xe1, 0x1f, 0xdf, 0x21, 0x3e, 0xc0, 0xa3, 0x5d, 0x42, 0xbc, 0x7c, 0x82, 0x9d, 0x63}, + {0x0, 0x8e, 0x1, 0x8f, 0x2, 0x8c, 0x3, 0x8d, 0x4, 0x8a, 0x5, 0x8b, 0x6, 0x88, 0x7, 0x89}, + {0x0, 0x9e, 0x21, 0xbf, 0x42, 0xdc, 0x63, 0xfd, 0x84, 0x1a, 0xa5, 0x3b, 0xc6, 0x58, 0xe7, 0x79}, + {0x0, 0xae, 0x41, 0xef, 0x82, 0x2c, 0xc3, 0x6d, 0x19, 0xb7, 0x58, 0xf6, 0x9b, 0x35, 0xda, 0x74}, + {0x0, 0xbe, 0x61, 0xdf, 0xc2, 0x7c, 0xa3, 0x1d, 0x99, 0x27, 0xf8, 0x46, 0x5b, 0xe5, 0x3a, 0x84}, + {0x0, 0x53, 0xa6, 0xf5, 0x51, 0x2, 0xf7, 0xa4, 0xa2, 0xf1, 0x4, 0x57, 0xf3, 0xa0, 0x55, 0x6}, + {0x0, 0x43, 0x86, 0xc5, 0x11, 0x52, 0x97, 0xd4, 0x22, 0x61, 0xa4, 0xe7, 0x33, 0x70, 0xb5, 0xf6}, + {0x0, 0x73, 0xe6, 0x95, 0xd1, 0xa2, 0x37, 0x44, 0xbf, 0xcc, 0x59, 0x2a, 0x6e, 0x1d, 0x88, 0xfb}, + {0x0, 0x63, 0xc6, 0xa5, 0x91, 0xf2, 0x57, 0x34, 0x3f, 0x5c, 0xf9, 0x9a, 0xae, 0xcd, 0x68, 0xb}, + {0x0, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1}, + {0x0, 0x3, 0x6, 0x5, 0xc, 0xf, 
0xa, 0x9, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11}, + {0x0, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x85, 0xb6, 0xe3, 0xd0, 0x49, 0x7a, 0x2f, 0x1c}, + {0x0, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x5, 0x26, 0x43, 0x60, 0x89, 0xaa, 0xcf, 0xec}, + {0x0, 0xd3, 0xbb, 0x68, 0x6b, 0xb8, 0xd0, 0x3, 0xd6, 0x5, 0x6d, 0xbe, 0xbd, 0x6e, 0x6, 0xd5}, + {0x0, 0xc3, 0x9b, 0x58, 0x2b, 0xe8, 0xb0, 0x73, 0x56, 0x95, 0xcd, 0xe, 0x7d, 0xbe, 0xe6, 0x25}, + {0x0, 0xf3, 0xfb, 0x8, 0xeb, 0x18, 0x10, 0xe3, 0xcb, 0x38, 0x30, 0xc3, 0x20, 0xd3, 0xdb, 0x28}, + {0x0, 0xe3, 0xdb, 0x38, 0xab, 0x48, 0x70, 0x93, 0x4b, 0xa8, 0x90, 0x73, 0xe0, 0x3, 0x3b, 0xd8}, + {0x0, 0x93, 0x3b, 0xa8, 0x76, 0xe5, 0x4d, 0xde, 0xec, 0x7f, 0xd7, 0x44, 0x9a, 0x9, 0xa1, 0x32}, + {0x0, 0x83, 0x1b, 0x98, 0x36, 0xb5, 0x2d, 0xae, 0x6c, 0xef, 0x77, 0xf4, 0x5a, 0xd9, 0x41, 0xc2}, + {0x0, 0xb3, 0x7b, 0xc8, 0xf6, 0x45, 0x8d, 0x3e, 0xf1, 0x42, 0x8a, 0x39, 0x7, 0xb4, 0x7c, 0xcf}, + {0x0, 0xa3, 0x5b, 0xf8, 0xb6, 0x15, 0xed, 0x4e, 0x71, 0xd2, 0x2a, 0x89, 0xc7, 0x64, 0x9c, 0x3f}, + {0x0, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1}, + {0x0, 0xf8, 0xed, 0x15, 0xc7, 0x3f, 0x2a, 0xd2, 0x93, 0x6b, 0x7e, 0x86, 0x54, 0xac, 0xb9, 0x41}, + {0x0, 0xc8, 0x8d, 0x45, 0x7, 0xcf, 0x8a, 0x42, 0xe, 0xc6, 0x83, 0x4b, 0x9, 0xc1, 0x84, 0x4c}, + {0x0, 0xd8, 0xad, 0x75, 0x47, 0x9f, 0xea, 0x32, 0x8e, 0x56, 0x23, 0xfb, 0xc9, 0x11, 0x64, 0xbc}, + {0x0, 0xa8, 0x4d, 0xe5, 0x9a, 0x32, 0xd7, 0x7f, 0x29, 0x81, 0x64, 0xcc, 0xb3, 0x1b, 0xfe, 0x56}, + {0x0, 0xb8, 0x6d, 0xd5, 0xda, 0x62, 0xb7, 0xf, 0xa9, 0x11, 0xc4, 0x7c, 0x73, 0xcb, 0x1e, 0xa6}, + {0x0, 0x88, 0xd, 0x85, 0x1a, 0x92, 0x17, 0x9f, 0x34, 0xbc, 0x39, 0xb1, 0x2e, 0xa6, 0x23, 0xab}, + {0x0, 0x98, 0x2d, 0xb5, 0x5a, 0xc2, 0x77, 0xef, 0xb4, 0x2c, 0x99, 0x1, 0xee, 0x76, 0xc3, 0x5b}, + {0x0, 0x68, 0xd0, 0xb8, 0xbd, 0xd5, 0x6d, 0x5, 0x67, 0xf, 0xb7, 0xdf, 0xda, 0xb2, 0xa, 0x62}, + {0x0, 0x78, 0xf0, 0x88, 0xfd, 0x85, 0xd, 0x75, 0xe7, 0x9f, 0x17, 0x6f, 0x1a, 0x62, 0xea, 0x92}, + {0x0, 0x48, 0x90, 0xd8, 0x3d, 0x75, 0xad, 0xe5, 0x7a, 0x32, 0xea, 0xa2, 0x47, 0xf, 0xd7, 0x9f}, + {0x0, 0x58, 0xb0, 0xe8, 0x7d, 0x25, 0xcd, 0x95, 0xfa, 0xa2, 0x4a, 0x12, 0x87, 0xdf, 0x37, 0x6f}, + {0x0, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x5d, 0x75, 0xd, 0x25, 0xfd, 0xd5, 0xad, 0x85}, + {0x0, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xdd, 0xe5, 0xad, 0x95, 0x3d, 0x5, 0x4d, 0x75}, + {0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78}, + {0x0, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88}, + {0x0, 0xf5, 0xf7, 0x2, 0xf3, 0x6, 0x4, 0xf1, 0xfb, 0xe, 0xc, 0xf9, 0x8, 0xfd, 0xff, 0xa}, + {0x0, 0xe5, 0xd7, 0x32, 0xb3, 0x56, 0x64, 0x81, 0x7b, 0x9e, 0xac, 0x49, 0xc8, 0x2d, 0x1f, 0xfa}, + {0x0, 0xd5, 0xb7, 0x62, 0x73, 0xa6, 0xc4, 0x11, 0xe6, 0x33, 0x51, 0x84, 0x95, 0x40, 0x22, 0xf7}, + {0x0, 0xc5, 0x97, 0x52, 0x33, 0xf6, 0xa4, 0x61, 0x66, 0xa3, 0xf1, 0x34, 0x55, 0x90, 0xc2, 0x7}, + {0x0, 0xb5, 0x77, 0xc2, 0xee, 0x5b, 0x99, 0x2c, 0xc1, 0x74, 0xb6, 0x3, 0x2f, 0x9a, 0x58, 0xed}, + {0x0, 0xa5, 0x57, 0xf2, 0xae, 0xb, 0xf9, 0x5c, 0x41, 0xe4, 0x16, 0xb3, 0xef, 0x4a, 0xb8, 0x1d}, + {0x0, 0x95, 0x37, 0xa2, 0x6e, 0xfb, 0x59, 0xcc, 0xdc, 0x49, 0xeb, 0x7e, 0xb2, 0x27, 0x85, 0x10}, + {0x0, 0x85, 0x17, 0x92, 0x2e, 0xab, 0x39, 0xbc, 0x5c, 0xd9, 0x4b, 0xce, 0x72, 0xf7, 0x65, 0xe0}, + {0x0, 0x75, 0xea, 0x9f, 0xc9, 0xbc, 0x23, 0x56, 0x8f, 0xfa, 0x65, 0x10, 0x46, 0x33, 0xac, 0xd9}, + {0x0, 0x65, 0xca, 0xaf, 0x89, 0xec, 0x43, 0x26, 0xf, 0x6a, 
0xc5, 0xa0, 0x86, 0xe3, 0x4c, 0x29}, + {0x0, 0x55, 0xaa, 0xff, 0x49, 0x1c, 0xe3, 0xb6, 0x92, 0xc7, 0x38, 0x6d, 0xdb, 0x8e, 0x71, 0x24}, + {0x0, 0x45, 0x8a, 0xcf, 0x9, 0x4c, 0x83, 0xc6, 0x12, 0x57, 0x98, 0xdd, 0x1b, 0x5e, 0x91, 0xd4}, + {0x0, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xb5, 0x80, 0xdf, 0xea, 0x61, 0x54, 0xb, 0x3e}, + {0x0, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x35, 0x10, 0x7f, 0x5a, 0xa1, 0x84, 0xeb, 0xce}, + {0x0, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3}, + {0x0, 0x5, 0xa, 0xf, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33}, + {0x0, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x4, 0xde, 0xc, 0x67, 0xb5, 0xb1, 0x63, 0x8, 0xda}, + {0x0, 0xc2, 0x99, 0x5b, 0x2f, 0xed, 0xb6, 0x74, 0x5e, 0x9c, 0xc7, 0x5, 0x71, 0xb3, 0xe8, 0x2a}, + {0x0, 0xf2, 0xf9, 0xb, 0xef, 0x1d, 0x16, 0xe4, 0xc3, 0x31, 0x3a, 0xc8, 0x2c, 0xde, 0xd5, 0x27}, + {0x0, 0xe2, 0xd9, 0x3b, 0xaf, 0x4d, 0x76, 0x94, 0x43, 0xa1, 0x9a, 0x78, 0xec, 0xe, 0x35, 0xd7}, + {0x0, 0x92, 0x39, 0xab, 0x72, 0xe0, 0x4b, 0xd9, 0xe4, 0x76, 0xdd, 0x4f, 0x96, 0x4, 0xaf, 0x3d}, + {0x0, 0x82, 0x19, 0x9b, 0x32, 0xb0, 0x2b, 0xa9, 0x64, 0xe6, 0x7d, 0xff, 0x56, 0xd4, 0x4f, 0xcd}, + {0x0, 0xb2, 0x79, 0xcb, 0xf2, 0x40, 0x8b, 0x39, 0xf9, 0x4b, 0x80, 0x32, 0xb, 0xb9, 0x72, 0xc0}, + {0x0, 0xa2, 0x59, 0xfb, 0xb2, 0x10, 0xeb, 0x49, 0x79, 0xdb, 0x20, 0x82, 0xcb, 0x69, 0x92, 0x30}, + {0x0, 0x52, 0xa4, 0xf6, 0x55, 0x7, 0xf1, 0xa3, 0xaa, 0xf8, 0xe, 0x5c, 0xff, 0xad, 0x5b, 0x9}, + {0x0, 0x42, 0x84, 0xc6, 0x15, 0x57, 0x91, 0xd3, 0x2a, 0x68, 0xae, 0xec, 0x3f, 0x7d, 0xbb, 0xf9}, + {0x0, 0x72, 0xe4, 0x96, 0xd5, 0xa7, 0x31, 0x43, 0xb7, 0xc5, 0x53, 0x21, 0x62, 0x10, 0x86, 0xf4}, + {0x0, 0x62, 0xc4, 0xa6, 0x95, 0xf7, 0x51, 0x33, 0x37, 0x55, 0xf3, 0x91, 0xa2, 0xc0, 0x66, 0x4}, + {0x0, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee}, + {0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e}, + {0x0, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x8d, 0xbf, 0xe9, 0xdb, 0x45, 0x77, 0x21, 0x13}, + {0x0, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0xd, 0x2f, 0x49, 0x6b, 0x85, 0xa7, 0xc1, 0xe3}, + {0x0, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61}, + {0x0, 0xdf, 0xa3, 0x7c, 0x5b, 0x84, 0xf8, 0x27, 0xb6, 0x69, 0x15, 0xca, 0xed, 0x32, 0x4e, 0x91}, + {0x0, 0xef, 0xc3, 0x2c, 0x9b, 0x74, 0x58, 0xb7, 0x2b, 0xc4, 0xe8, 0x7, 0xb0, 0x5f, 0x73, 0x9c}, + {0x0, 0xff, 0xe3, 0x1c, 0xdb, 0x24, 0x38, 0xc7, 0xab, 0x54, 0x48, 0xb7, 0x70, 0x8f, 0x93, 0x6c}, + {0x0, 0x8f, 0x3, 0x8c, 0x6, 0x89, 0x5, 0x8a, 0xc, 0x83, 0xf, 0x80, 0xa, 0x85, 0x9, 0x86}, + {0x0, 0x9f, 0x23, 0xbc, 0x46, 0xd9, 0x65, 0xfa, 0x8c, 0x13, 0xaf, 0x30, 0xca, 0x55, 0xe9, 0x76}, + {0x0, 0xaf, 0x43, 0xec, 0x86, 0x29, 0xc5, 0x6a, 0x11, 0xbe, 0x52, 0xfd, 0x97, 0x38, 0xd4, 0x7b}, + {0x0, 0xbf, 0x63, 0xdc, 0xc6, 0x79, 0xa5, 0x1a, 0x91, 0x2e, 0xf2, 0x4d, 0x57, 0xe8, 0x34, 0x8b}, + {0x0, 0x4f, 0x9e, 0xd1, 0x21, 0x6e, 0xbf, 0xf0, 0x42, 0xd, 0xdc, 0x93, 0x63, 0x2c, 0xfd, 0xb2}, + {0x0, 0x5f, 0xbe, 0xe1, 0x61, 0x3e, 0xdf, 0x80, 0xc2, 0x9d, 0x7c, 0x23, 0xa3, 0xfc, 0x1d, 0x42}, + {0x0, 0x6f, 0xde, 0xb1, 0xa1, 0xce, 0x7f, 0x10, 0x5f, 0x30, 0x81, 0xee, 0xfe, 0x91, 0x20, 0x4f}, + {0x0, 0x7f, 0xfe, 0x81, 0xe1, 0x9e, 0x1f, 0x60, 0xdf, 0xa0, 0x21, 0x5e, 0x3e, 0x41, 0xc0, 0xbf}, + {0x0, 0xf, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55}, + {0x0, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 
0xba, 0xa5}, + {0x0, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x65, 0x4a, 0x3b, 0x14, 0xd9, 0xf6, 0x87, 0xa8}, + {0x0, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xe5, 0xda, 0x9b, 0xa4, 0x19, 0x26, 0x67, 0x58}, + {0x0, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x8, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67}, + {0x0, 0x8c, 0x5, 0x89, 0xa, 0x86, 0xf, 0x83, 0x14, 0x98, 0x11, 0x9d, 0x1e, 0x92, 0x1b, 0x97}, + {0x0, 0xbc, 0x65, 0xd9, 0xca, 0x76, 0xaf, 0x13, 0x89, 0x35, 0xec, 0x50, 0x43, 0xff, 0x26, 0x9a}, + {0x0, 0xac, 0x45, 0xe9, 0x8a, 0x26, 0xcf, 0x63, 0x9, 0xa5, 0x4c, 0xe0, 0x83, 0x2f, 0xc6, 0x6a}, + {0x0, 0xdc, 0xa5, 0x79, 0x57, 0x8b, 0xf2, 0x2e, 0xae, 0x72, 0xb, 0xd7, 0xf9, 0x25, 0x5c, 0x80}, + {0x0, 0xcc, 0x85, 0x49, 0x17, 0xdb, 0x92, 0x5e, 0x2e, 0xe2, 0xab, 0x67, 0x39, 0xf5, 0xbc, 0x70}, + {0x0, 0xfc, 0xe5, 0x19, 0xd7, 0x2b, 0x32, 0xce, 0xb3, 0x4f, 0x56, 0xaa, 0x64, 0x98, 0x81, 0x7d}, + {0x0, 0xec, 0xc5, 0x29, 0x97, 0x7b, 0x52, 0xbe, 0x33, 0xdf, 0xf6, 0x1a, 0xa4, 0x48, 0x61, 0x8d}, + {0x0, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4}, + {0x0, 0xc, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44}, + {0x0, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xfd, 0xc1, 0x85, 0xb9, 0xd, 0x31, 0x75, 0x49}, + {0x0, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x7d, 0x51, 0x25, 0x9, 0xcd, 0xe1, 0x95, 0xb9}, + {0x0, 0x5c, 0xb8, 0xe4, 0x6d, 0x31, 0xd5, 0x89, 0xda, 0x86, 0x62, 0x3e, 0xb7, 0xeb, 0xf, 0x53}, + {0x0, 0x4c, 0x98, 0xd4, 0x2d, 0x61, 0xb5, 0xf9, 0x5a, 0x16, 0xc2, 0x8e, 0x77, 0x3b, 0xef, 0xa3}, + {0x0, 0x7c, 0xf8, 0x84, 0xed, 0x91, 0x15, 0x69, 0xc7, 0xbb, 0x3f, 0x43, 0x2a, 0x56, 0xd2, 0xae}, + {0x0, 0x6c, 0xd8, 0xb4, 0xad, 0xc1, 0x75, 0x19, 0x47, 0x2b, 0x9f, 0xf3, 0xea, 0x86, 0x32, 0x5e}, + {0x0, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc}, + {0x0, 0x91, 0x3f, 0xae, 0x7e, 0xef, 0x41, 0xd0, 0xfc, 0x6d, 0xc3, 0x52, 0x82, 0x13, 0xbd, 0x2c}, + {0x0, 0xa1, 0x5f, 0xfe, 0xbe, 0x1f, 0xe1, 0x40, 0x61, 0xc0, 0x3e, 0x9f, 0xdf, 0x7e, 0x80, 0x21}, + {0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x30, 0xe1, 0x50, 0x9e, 0x2f, 0x1f, 0xae, 0x60, 0xd1}, + {0x0, 0xc1, 0x9f, 0x5e, 0x23, 0xe2, 0xbc, 0x7d, 0x46, 0x87, 0xd9, 0x18, 0x65, 0xa4, 0xfa, 0x3b}, + {0x0, 0xd1, 0xbf, 0x6e, 0x63, 0xb2, 0xdc, 0xd, 0xc6, 0x17, 0x79, 0xa8, 0xa5, 0x74, 0x1a, 0xcb}, + {0x0, 0xe1, 0xdf, 0x3e, 0xa3, 0x42, 0x7c, 0x9d, 0x5b, 0xba, 0x84, 0x65, 0xf8, 0x19, 0x27, 0xc6}, + {0x0, 0xf1, 0xff, 0xe, 0xe3, 0x12, 0x1c, 0xed, 0xdb, 0x2a, 0x24, 0xd5, 0x38, 0xc9, 0xc7, 0x36}, + {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, + {0x0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, + {0x0, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x15, 0x34, 0x57, 0x76, 0x91, 0xb0, 0xd3, 0xf2}, + {0x0, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x95, 0xa4, 0xf7, 0xc6, 0x51, 0x60, 0x33, 0x2}, + {0x0, 0x41, 0x82, 0xc3, 0x19, 0x58, 0x9b, 0xda, 0x32, 0x73, 0xb0, 0xf1, 0x2b, 0x6a, 0xa9, 0xe8}, + {0x0, 0x51, 0xa2, 0xf3, 0x59, 0x8, 0xfb, 0xaa, 0xb2, 0xe3, 0x10, 0x41, 0xeb, 0xba, 0x49, 0x18}, + {0x0, 0x61, 0xc2, 0xa3, 0x99, 0xf8, 0x5b, 0x3a, 0x2f, 0x4e, 0xed, 0x8c, 0xb6, 0xd7, 0x74, 0x15}, + {0x0, 0x71, 0xe2, 0x93, 0xd9, 0xa8, 0x3b, 0x4a, 0xaf, 0xde, 0x4d, 0x3c, 0x76, 0x7, 0x94, 0xe5}, + {0x0, 0xa6, 0x51, 0xf7, 0xa2, 0x4, 0xf3, 0x55, 0x59, 0xff, 0x8, 0xae, 0xfb, 0x5d, 0xaa, 0xc}, + {0x0, 0xb6, 0x71, 0xc7, 0xe2, 0x54, 0x93, 0x25, 0xd9, 0x6f, 0xa8, 0x1e, 0x3b, 0x8d, 0x4a, 0xfc}, + {0x0, 
0x86, 0x11, 0x97, 0x22, 0xa4, 0x33, 0xb5, 0x44, 0xc2, 0x55, 0xd3, 0x66, 0xe0, 0x77, 0xf1}, + {0x0, 0x96, 0x31, 0xa7, 0x62, 0xf4, 0x53, 0xc5, 0xc4, 0x52, 0xf5, 0x63, 0xa6, 0x30, 0x97, 0x1}, + {0x0, 0xe6, 0xd1, 0x37, 0xbf, 0x59, 0x6e, 0x88, 0x63, 0x85, 0xb2, 0x54, 0xdc, 0x3a, 0xd, 0xeb}, + {0x0, 0xf6, 0xf1, 0x7, 0xff, 0x9, 0xe, 0xf8, 0xe3, 0x15, 0x12, 0xe4, 0x1c, 0xea, 0xed, 0x1b}, + {0x0, 0xc6, 0x91, 0x57, 0x3f, 0xf9, 0xae, 0x68, 0x7e, 0xb8, 0xef, 0x29, 0x41, 0x87, 0xd0, 0x16}, + {0x0, 0xd6, 0xb1, 0x67, 0x7f, 0xa9, 0xce, 0x18, 0xfe, 0x28, 0x4f, 0x99, 0x81, 0x57, 0x30, 0xe6}, + {0x0, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x2d, 0xb, 0x61, 0x47, 0xb5, 0x93, 0xf9, 0xdf}, + {0x0, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xad, 0x9b, 0xc1, 0xf7, 0x75, 0x43, 0x19, 0x2f}, + {0x0, 0x6, 0xc, 0xa, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22}, + {0x0, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2}, + {0x0, 0x66, 0xcc, 0xaa, 0x85, 0xe3, 0x49, 0x2f, 0x17, 0x71, 0xdb, 0xbd, 0x92, 0xf4, 0x5e, 0x38}, + {0x0, 0x76, 0xec, 0x9a, 0xc5, 0xb3, 0x29, 0x5f, 0x97, 0xe1, 0x7b, 0xd, 0x52, 0x24, 0xbe, 0xc8}, + {0x0, 0x46, 0x8c, 0xca, 0x5, 0x43, 0x89, 0xcf, 0xa, 0x4c, 0x86, 0xc0, 0xf, 0x49, 0x83, 0xc5}, + {0x0, 0x56, 0xac, 0xfa, 0x45, 0x13, 0xe9, 0xbf, 0x8a, 0xdc, 0x26, 0x70, 0xcf, 0x99, 0x63, 0x35}, + {0x0, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x6, 0xb1, 0xa, 0xda, 0x61, 0x67, 0xdc, 0xc, 0xb7}, + {0x0, 0xab, 0x4b, 0xe0, 0x96, 0x3d, 0xdd, 0x76, 0x31, 0x9a, 0x7a, 0xd1, 0xa7, 0xc, 0xec, 0x47}, + {0x0, 0x9b, 0x2b, 0xb0, 0x56, 0xcd, 0x7d, 0xe6, 0xac, 0x37, 0x87, 0x1c, 0xfa, 0x61, 0xd1, 0x4a}, + {0x0, 0x8b, 0xb, 0x80, 0x16, 0x9d, 0x1d, 0x96, 0x2c, 0xa7, 0x27, 0xac, 0x3a, 0xb1, 0x31, 0xba}, + {0x0, 0xfb, 0xeb, 0x10, 0xcb, 0x30, 0x20, 0xdb, 0x8b, 0x70, 0x60, 0x9b, 0x40, 0xbb, 0xab, 0x50}, + {0x0, 0xeb, 0xcb, 0x20, 0x8b, 0x60, 0x40, 0xab, 0xb, 0xe0, 0xc0, 0x2b, 0x80, 0x6b, 0x4b, 0xa0}, + {0x0, 0xdb, 0xab, 0x70, 0x4b, 0x90, 0xe0, 0x3b, 0x96, 0x4d, 0x3d, 0xe6, 0xdd, 0x6, 0x76, 0xad}, + {0x0, 0xcb, 0x8b, 0x40, 0xb, 0xc0, 0x80, 0x4b, 0x16, 0xdd, 0x9d, 0x56, 0x1d, 0xd6, 0x96, 0x5d}, + {0x0, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xc5, 0xfe, 0xb3, 0x88, 0x29, 0x12, 0x5f, 0x64}, + {0x0, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x45, 0x6e, 0x13, 0x38, 0xe9, 0xc2, 0xbf, 0x94}, + {0x0, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}, + {0x0, 0xb, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69}, + {0x0, 0x7b, 0xf6, 0x8d, 0xf1, 0x8a, 0x7, 0x7c, 0xff, 0x84, 0x9, 0x72, 0xe, 0x75, 0xf8, 0x83}, + {0x0, 0x6b, 0xd6, 0xbd, 0xb1, 0xda, 0x67, 0xc, 0x7f, 0x14, 0xa9, 0xc2, 0xce, 0xa5, 0x18, 0x73}, + {0x0, 0x5b, 0xb6, 0xed, 0x71, 0x2a, 0xc7, 0x9c, 0xe2, 0xb9, 0x54, 0xf, 0x93, 0xc8, 0x25, 0x7e}, + {0x0, 0x4b, 0x96, 0xdd, 0x31, 0x7a, 0xa7, 0xec, 0x62, 0x29, 0xf4, 0xbf, 0x53, 0x18, 0xc5, 0x8e}} + +// galMultiply multiplies to elements of the field. +// Uses lookup table ~40% faster +func galMultiply(a, b byte) byte { + return mulTable[a][b] +} + +// Original function: +/* +// galMultiply multiplies to elements of the field. +func galMultiply(a, b byte) byte { + if a == 0 || b == 0 { + return 0 + } + logA := int(logTable[a]) + logB := int(logTable[b]) + return expTable[logA+logB] +} +*/ + +// galDivide is inverse of galMultiply. 
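+// It computes a / b in GF(2^8) via the log/exp tables: exp((log(a) - log(b)) mod 255).
+// Dividing by zero panics; a zero dividend returns zero.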
+func galDivide(a, b byte) byte { + if a == 0 { + return 0 + } + if b == 0 { + panic("Argument 'divisor' is 0") + } + logA := int(logTable[a]) + logB := int(logTable[b]) + logResult := logA - logB + if logResult < 0 { + logResult += 255 + } + return expTable[uint8(logResult)] +} + +// galOneOver is the same as galDivide(1, a). +func galOneOver(a byte) byte { + if a == 0 { + panic("Argument 'divisor' is 0") + } + logResult := logTable[a] ^ 255 + return expTable[logResult] +} + +// Computes a**n. +// +// The result will be the same as multiplying a times itself n times. +func galExp(a byte, n int) byte { + if n == 0 { + return 1 + } + if a == 0 { + return 0 + } + + logA := logTable[a] + logResult := int(logA) * n + for logResult >= 255 { + logResult -= 255 + } + return expTable[uint8(logResult)] +} + +func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { + if !avx2CodeGen { + panic("codegen not enabled") + } + total := inputs * outputs + + // Duplicated in+out + wantBytes := total * 32 * 2 + if cap(dst) < wantBytes { + dst = AllocAligned(1, wantBytes)[0] + } else { + dst = dst[:wantBytes] + } + for i, row := range matrixRows[:outputs] { + for j, idx := range row[inIdx : inIdx+inputs] { + dstIdx := (j*outputs + i) * 64 + dstPart := dst[dstIdx:] + dstPart = dstPart[:64] + lo := mulTableLow[idx][:] + hi := mulTableHigh[idx][:] + copy(dstPart[:16], lo) + copy(dstPart[16:32], lo) + copy(dstPart[32:48], hi) + copy(dstPart[48:64], hi) + } + } + return dst +} + +var gf2p811dMulMatrices = [256]uint64{0, 0x102040810204080, 0x8001828488102040, 0x8103868c983060c0, 0x408041c2c4881020, 0x418245cad4a850a0, 0xc081c3464c983060, 0xc183c74e5cb870e0, 0x2040a061e2c48810, 0x2142a469f2e4c890, 0xa04122e56ad4a850, 0xa14326ed7af4e8d0, 0x60c0e1a3264c9830, 0x61c2e5ab366cd8b0, 0xe0c16327ae5cb870, 0xe1c3672fbe7cf8f0, 0x102050b071e2c488, 0x112254b861c28408, 0x9021d234f9f2e4c8, 0x9123d63ce9d2a448, 0x50a01172b56ad4a8, 0x51a2157aa54a9428, 0xd0a193f63d7af4e8, 0xd1a397fe2d5ab468, 0x3060f0d193264c98, 0x3162f4d983060c18, 0xb06172551b366cd8, 0xb163765d0b162c58, 0x70e0b11357ae5cb8, 0x71e2b51b478e1c38, 0xf0e13397dfbe7cf8, 0xf1e3379fcf9e3c78, 0x8810a8d83871e2c4, 0x8912acd02851a244, 0x8112a5cb061c284, 0x9132e54a0418204, 0xc890e91afcf9f2e4, 0xc992ed12ecd9b264, 0x48916b9e74e9d2a4, 0x49936f9664c99224, 0xa85008b9dab56ad4, 0xa9520cb1ca952a54, 0x28518a3d52a54a94, 0x29538e3542850a14, 0xe8d0497b1e3d7af4, 0xe9d24d730e1d3a74, 0x68d1cbff962d5ab4, 0x69d3cff7860d1a34, 0x9830f8684993264c, 0x9932fc6059b366cc, 0x18317aecc183060c, 0x19337ee4d1a3468c, 0xd8b0b9aa8d1b366c, 0xd9b2bda29d3b76ec, 0x58b13b2e050b162c, 0x59b33f26152b56ac, 0xb8705809ab57ae5c, 0xb9725c01bb77eedc, 0x3871da8d23478e1c, 0x3973de853367ce9c, 0xf8f019cb6fdfbe7c, 0xf9f21dc37ffffefc, 0x78f19b4fe7cf9e3c, 0x79f39f47f7efdebc, 0xc488d46c1c3871e2, 0xc58ad0640c183162, 0x448956e8942851a2, 0x458b52e084081122, 0x840895aed8b061c2, 0x850a91a6c8902142, 0x409172a50a04182, 0x50b132240800102, 0xe4c8740dfefcf9f2, 0xe5ca7005eedcb972, 0x64c9f68976ecd9b2, 0x65cbf28166cc9932, 0xa44835cf3a74e9d2, 0xa54a31c72a54a952, 0x2449b74bb264c992, 0x254bb343a2448912, 0xd4a884dc6ddab56a, 0xd5aa80d47dfaf5ea, 0x54a90658e5ca952a, 0x55ab0250f5ead5aa, 0x9428c51ea952a54a, 0x952ac116b972e5ca, 0x1429479a2142850a, 0x152b43923162c58a, 0xf4e824bd8f1e3d7a, 0xf5ea20b59f3e7dfa, 0x74e9a639070e1d3a, 0x75eba231172e5dba, 0xb468657f4b962d5a, 0xb56a61775bb66dda, 0x3469e7fbc3860d1a, 0x356be3f3d3a64d9a, 0x4c987cb424499326, 0x4d9a78bc3469d3a6, 0xcc99fe30ac59b366, 0xcd9bfa38bc79f3e6, 0xc183d76e0c18306, 
0xd1a397ef0e1c386, 0x8c19bff268d1a346, 0x8d1bbbfa78f1e3c6, 0x6cd8dcd5c68d1b36, 0x6ddad8ddd6ad5bb6, 0xecd95e514e9d3b76, 0xeddb5a595ebd7bf6, 0x2c589d1702050b16, 0x2d5a991f12254b96, 0xac591f938a152b56, 0xad5b1b9b9a356bd6, 0x5cb82c0455ab57ae, 0x5dba280c458b172e, 0xdcb9ae80ddbb77ee, 0xddbbaa88cd9b376e, 0x1c386dc69123478e, 0x1d3a69ce8103070e, 0x9c39ef42193367ce, 0x9d3beb4a0913274e, 0x7cf88c65b76fdfbe, 0x7dfa886da74f9f3e, 0xfcf90ee13f7ffffe, 0xfdfb0ae92f5fbf7e, 0x3c78cda773e7cf9e, 0x3d7ac9af63c78f1e, 0xbc794f23fbf7efde, 0xbd7b4b2bebd7af5e, 0xe2c46a368e1c3871, 0xe3c66e3e9e3c78f1, 0x62c5e8b2060c1831, 0x63c7ecba162c58b1, 0xa2442bf44a942851, 0xa3462ffc5ab468d1, 0x2245a970c2840811, 0x2347ad78d2a44891, 0xc284ca576cd8b061, 0xc386ce5f7cf8f0e1, 0x428548d3e4c89021, 0x43874cdbf4e8d0a1, 0x82048b95a850a041, 0x83068f9db870e0c1, 0x205091120408001, 0x3070d193060c081, 0xf2e43a86fffefcf9, 0xf3e63e8eefdebc79, 0x72e5b80277eedcb9, 0x73e7bc0a67ce9c39, 0xb2647b443b76ecd9, 0xb3667f4c2b56ac59, 0x3265f9c0b366cc99, 0x3367fdc8a3468c19, 0xd2a49ae71d3a74e9, 0xd3a69eef0d1a3469, 0x52a51863952a54a9, 0x53a71c6b850a1429, 0x9224db25d9b264c9, 0x9326df2dc9922449, 0x122559a151a24489, 0x13275da941820409, 0x6ad4c2eeb66ddab5, 0x6bd6c6e6a64d9a35, 0xead5406a3e7dfaf5, 0xebd744622e5dba75, 0x2a54832c72e5ca95, 0x2b56872462c58a15, 0xaa5501a8faf5ead5, 0xab5705a0ead5aa55, 0x4a94628f54a952a5, 0x4b96668744891225, 0xca95e00bdcb972e5, 0xcb97e403cc993265, 0xa14234d90214285, 0xb16274580010205, 0x8a15a1c9183162c5, 0x8b17a5c108112245, 0x7af4925ec78f1e3d, 0x7bf69656d7af5ebd, 0xfaf510da4f9f3e7d, 0xfbf714d25fbf7efd, 0x3a74d39c03070e1d, 0x3b76d79413274e9d, 0xba7551188b172e5d, 0xbb7755109b376edd, 0x5ab4323f254b962d, 0x5bb63637356bd6ad, 0xdab5b0bbad5bb66d, 0xdbb7b4b3bd7bf6ed, 0x1a3473fde1c3860d, 0x1b3677f5f1e3c68d, 0x9a35f17969d3a64d, 0x9b37f57179f3e6cd, 0x264cbe5a92244993, 0x274eba5282040913, 0xa64d3cde1a3469d3, 0xa74f38d60a142953, 0x66ccff9856ac59b3, 0x67cefb90468c1933, 0xe6cd7d1cdebc79f3, 0xe7cf7914ce9c3973, 0x60c1e3b70e0c183, 0x70e1a3360c08103, 0x860d9cbff8f0e1c3, 0x870f98b7e8d0a143, 0x468c5ff9b468d1a3, 0x478e5bf1a4489123, 0xc68ddd7d3c78f1e3, 0xc78fd9752c58b163, 0x366ceeeae3c68d1b, 0x376eeae2f3e6cd9b, 0xb66d6c6e6bd6ad5b, 0xb76f68667bf6eddb, 0x76ecaf28274e9d3b, 0x77eeab20376eddbb, 0xf6ed2dacaf5ebd7b, 0xf7ef29a4bf7efdfb, 0x162c4e8b0102050b, 0x172e4a831122458b, 0x962dcc0f8912254b, 0x972fc807993265cb, 0x56ac0f49c58a152b, 0x57ae0b41d5aa55ab, 0xd6ad8dcd4d9a356b, 0xd7af89c55dba75eb, 0xae5c1682aa55ab57, 0xaf5e128aba75ebd7, 0x2e5d940622458b17, 0x2f5f900e3265cb97, 0xeedc57406eddbb77, 0xefde53487efdfbf7, 0x6eddd5c4e6cd9b37, 0x6fdfd1ccf6eddbb7, 0x8e1cb6e348912347, 0x8f1eb2eb58b163c7, 0xe1d3467c0810307, 0xf1f306fd0a14387, 0xce9cf7218c193367, 0xcf9ef3299c3973e7, 0x4e9d75a504091327, 0x4f9f71ad142953a7, 0xbe7c4632dbb76fdf, 0xbf7e423acb972f5f, 0x3e7dc4b653a74f9f, 0x3f7fc0be43870f1f, 0xfefc07f01f3f7fff, 0xfffe03f80f1f3f7f, 0x7efd8574972f5fbf, 0x7fff817c870f1f3f, 0x9e3ce6533973e7cf, 0x9f3ee25b2953a74f, 0x1e3d64d7b163c78f, 0x1f3f60dfa143870f, 0xdebca791fdfbf7ef, 0xdfbea399eddbb76f, 0x5ebd251575ebd7af, 0x5fbf211d65cb972f} + +func genGFNIMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []uint64) []uint64 { + if !avx2CodeGen { + panic("codegen not enabled") + } + total := inputs * outputs + + // Duplicated in+out + dst = dst[:total] + for i, row := range matrixRows[:outputs] { + for j, idx := range row[inIdx : inIdx+inputs] { + dst[j*outputs+i] = gf2p811dMulMatrices[idx] + } + } + return dst +} + +// xor slices writing to out. 
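+// The generic fallback XORs 32 bytes per iteration as four 64-bit words and
+// finishes any remaining tail one byte at a time.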
+func sliceXorGo(in, out []byte, _ *options) { + for len(out) >= 32 { + inS := in[:32] + v0 := binary.LittleEndian.Uint64(out[:8]) ^ binary.LittleEndian.Uint64(inS[:8]) + v1 := binary.LittleEndian.Uint64(out[8:16]) ^ binary.LittleEndian.Uint64(inS[8:16]) + v2 := binary.LittleEndian.Uint64(out[16:24]) ^ binary.LittleEndian.Uint64(inS[16:24]) + v3 := binary.LittleEndian.Uint64(out[24:32]) ^ binary.LittleEndian.Uint64(inS[24:32]) + binary.LittleEndian.PutUint64(out[:8], v0) + binary.LittleEndian.PutUint64(out[8:16], v1) + binary.LittleEndian.PutUint64(out[16:24], v2) + binary.LittleEndian.PutUint64(out[24:32], v3) + out = out[32:] + in = in[32:] + } + out = out[:len(in)] + for n, input := range in { + out[n] ^= input + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go new file mode 100644 index 000000000..8099f1664 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go @@ -0,0 +1,583 @@ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. + +package reedsolomon + +const pshufb = true + +//go:noescape +func galMulSSSE3(low, high, in, out []byte) + +//go:noescape +func galMulSSSE3Xor(low, high, in, out []byte) + +//go:noescape +func galMulAVX2Xor(low, high, in, out []byte) + +//go:noescape +func galMulAVX2(low, high, in, out []byte) + +//go:noescape +func galMulAVX2Xor_64(low, high, in, out []byte) + +//go:noescape +func galMulAVX2_64(low, high, in, out []byte) + +// This is what the assembler routines do in blocks of 16 bytes: +/* +func galMulSSSE3(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] = low[l] ^ high[h] + } +} + +func galMulSSSE3Xor(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] ^= low[l] ^ high[h] + } +} +*/ + +// bigSwitchover is the size where 64 bytes are processed per loop. 
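+// Inputs of at least this many bytes use the 64-byte AVX2 kernels; shorter
+// input falls through to the 32-byte AVX2, 16-byte SSSE3/SSE2, or scalar table paths.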
+const bigSwitchover = 128 + +func galMulSlice(c byte, in, out []byte, o *options) { + if c == 1 { + copy(out, in) + return + } + if o.useAVX2 { + if len(in) >= bigSwitchover { + galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + if len(in) > 32 { + galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 5) << 5 + in = in[done:] + out = out[done:] + } + } else if o.useSSSE3 { + galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + out = out[:len(in)] + mt := mulTable[c][:256] + for i := range in { + out[i] = mt[in[i]] + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + if c == 1 { + sliceXor(in, out, o) + return + } + + if o.useAVX2 { + if len(in) >= bigSwitchover { + galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + if len(in) >= 32 { + galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 5) << 5 + in = in[done:] + out = out[done:] + } + } else if o.useSSSE3 { + galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + if len(in) == 0 { + return + } + out = out[:len(in)] + mt := mulTable[c][:256] + for i := range in { + out[i] ^= mt[in[i]] + } +} + +// simple slice xor +func sliceXor(in, out []byte, o *options) { + if o.useSSE2 { + if len(in) >= bigSwitchover { + if o.useAVX2 { + avx2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } else { + sSE2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + } + if len(in) >= 16 { + sSE2XorSlice(in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + } else { + sliceXorGo(in, out, o) + return + } + out = out[:len(in)] + for i := range in { + out[i] ^= in[i] + } +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + if len(work[0]) == 0 { + return + } + + t01 := &multiply256LUT[log_m01] + t23 := &multiply256LUT[log_m23] + t02 := &multiply256LUT[log_m02] + if o.useAVX512 { + if log_m01 == modulus { + if log_m23 == modulus { + if log_m02 == modulus { + ifftDIT4_avx512_7(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx512_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus { + ifftDIT4_avx512_5(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx512_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus { + if log_m02 == modulus { + ifftDIT4_avx512_6(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx512_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus { + ifftDIT4_avx512_4(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx512_0(work, dist*24, t01, t23, t02) + } + } + } + return + } else if o.useAVX2 { + if log_m01 == modulus { + if log_m23 == modulus { + if log_m02 == modulus { + ifftDIT4_avx2_7(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx2_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus { + ifftDIT4_avx2_5(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx2_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus { + if log_m02 == modulus { + ifftDIT4_avx2_6(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx2_2(work, dist*24, t01, t23, t02) + } + } else { + if 
log_m02 == modulus { + ifftDIT4_avx2_4(work, dist*24, t01, t23, t02) + } else { + ifftDIT4_avx2_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + if len(work[0]) == 0 { + return + } + + if false && o.useAvx512GFNI { + // Note that these currently require that length is multiple of 64. + t01 := gf2p811dMulMatrices[log_m01] + t23 := gf2p811dMulMatrices[log_m23] + t02 := gf2p811dMulMatrices[log_m02] + if log_m01 == modulus8 { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_gfni_7(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_gfni_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_gfni_5(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_gfni_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_gfni_6(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_gfni_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_gfni_4(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_gfni_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + if o.useAVX2 { + // Note that these currently require that length is multiple of 64. + t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] + if log_m01 == modulus8 { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_avx2_7(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx2_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_avx2_5(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx2_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m23 == modulus8 { + if log_m02 == modulus8 { + ifftDIT48_avx2_6(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx2_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m02 == modulus8 { + ifftDIT48_avx2_4(work, dist*24, t01, t23, t02) + } else { + ifftDIT48_avx2_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + if len(work[0]) == 0 { + return + } + + t01 := &multiply256LUT[log_m01] + t23 := &multiply256LUT[log_m23] + t02 := &multiply256LUT[log_m02] + if o.useAVX512 { + if log_m02 == modulus { + if log_m01 == modulus { + if log_m23 == modulus { + fftDIT4_avx512_7(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx512_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus { + fftDIT4_avx512_5(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx512_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m01 == modulus { + if log_m23 == modulus { + fftDIT4_avx512_6(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx512_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus { + fftDIT4_avx512_4(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx512_0(work, dist*24, t01, t23, t02) + } + } + } + return + } else if o.useAVX2 { + if log_m02 == modulus { + if log_m01 == modulus { + if log_m23 == modulus { + fftDIT4_avx2_7(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx2_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus { + fftDIT4_avx2_5(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx2_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m01 == 
modulus { + if log_m23 == modulus { + fftDIT4_avx2_6(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx2_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus { + fftDIT4_avx2_4(work, dist*24, t01, t23, t02) + } else { + fftDIT4_avx2_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + if len(work[0]) == 0 { + return + } + + if false && o.useAvx512GFNI { + t01 := gf2p811dMulMatrices[log_m01] + t23 := gf2p811dMulMatrices[log_m23] + t02 := gf2p811dMulMatrices[log_m02] + // Note that these currently require that length is multiple of 64. + if log_m02 == modulus8 { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_gfni_7(work, dist*24, t01, t23, t02) + } else { + fftDIT48_gfni_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_gfni_5(work, dist*24, t01, t23, t02) + } else { + fftDIT48_gfni_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_gfni_6(work, dist*24, t01, t23, t02) + } else { + fftDIT48_gfni_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_gfni_4(work, dist*24, t01, t23, t02) + } else { + fftDIT48_gfni_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + if o.useAVX2 { + t01 := &multiply256LUT8[log_m01] + t23 := &multiply256LUT8[log_m23] + t02 := &multiply256LUT8[log_m02] + // Note that these currently require that length is multiple of 64. + if log_m02 == modulus8 { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_avx2_7(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx2_3(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_avx2_5(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx2_1(work, dist*24, t01, t23, t02) + } + } + } else { + if log_m01 == modulus8 { + if log_m23 == modulus8 { + fftDIT48_avx2_6(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx2_2(work, dist*24, t01, t23, t02) + } + } else { + if log_m23 == modulus8 { + fftDIT48_avx2_4(work, dist*24, t01, t23, t02) + } else { + fftDIT48_avx2_0(work, dist*24, t01, t23, t02) + } + } + } + return + } + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + if len(x) == 0 { + return + } + if o.useAVX2 { + tmp := &multiply256LUT[log_m] + fftDIT2_avx2(x, y, tmp) + } else if o.useSSSE3 { + tmp := &multiply256LUT[log_m] + fftDIT2_ssse3(x, y, tmp) + } else { + // Reference version: + refMulAdd(x, y, log_m) + sliceXor(x, y, o) + } +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + if len(x) == 0 { + return + } + + if o.useAVX2 { + fftDIT28_avx2(x, y, &multiply256LUT8[log_m]) + if len(x)&63 == 0 { + return + } + done := (len(y) >> 6) << 6 + y = y[done:] + x = x[done:] + } + mulAdd8(x, y, log_m, o) + sliceXor(x, y, o) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + if len(x) == 0 { + return + } + + if o.useAVX2 { + ifftDIT28_avx2(x, y, &multiply256LUT8[log_m]) + if len(x)&63 == 0 { + return + } + done := (len(y) >> 6) << 6 + y = y[done:] + x = x[done:] + } + sliceXor(x, y, o) + mulAdd8(x, y, log_m, o) +} + +func mulAdd8(x, y []byte, log_m ffe8, o *options) { + if o.useAVX2 { + t := &multiply256LUT8[log_m] + galMulAVX2Xor_64(t[:16], t[16:32], y, x) + done := (len(y) >> 
6) << 6 + y = y[done:] + x = x[done:] + } else if o.useSSSE3 { + t := &multiply256LUT8[log_m] + galMulSSSE3Xor(t[:16], t[16:32], y, x) + done := (len(y) >> 4) << 4 + y = y[done:] + x = x[done:] + } + refMulAdd8(x, y, log_m) +} + +// 2-way butterfly +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + if len(x) == 0 { + return + } + if o.useAVX2 { + tmp := &multiply256LUT[log_m] + ifftDIT2_avx2(x, y, tmp) + } else if o.useSSSE3 { + tmp := &multiply256LUT[log_m] + ifftDIT2_ssse3(x, y, tmp) + } else { + // Reference version: + sliceXor(x, y, o) + refMulAdd(x, y, log_m) + } +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + if len(x) == 0 { + return + } + if o.useAVX2 { + tmp := &multiply256LUT[log_m] + mulgf16_avx2(x, y, tmp) + } else if o.useSSSE3 { + tmp := &multiply256LUT[log_m] + mulgf16_ssse3(x, y, tmp) + } else { + refMul(x, y, log_m) + } +} + +func mulgf8(out, in []byte, log_m ffe8, o *options) { + if o.useAVX2 { + t := &multiply256LUT8[log_m] + galMulAVX2_64(t[:16], t[16:32], in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } else if o.useSSSE3 { + t := &multiply256LUT8[log_m] + galMulSSSE3(t[:16], t[16:32], in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + out = out[:len(in)] + mt := mul8LUTs[log_m].Value[:] + for i := range in { + out[i] = byte(mt[in[i]]) + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s new file mode 100644 index 000000000..18e08c316 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s @@ -0,0 +1,310 @@ +//+build !noasm +//+build !appengine +//+build !gccgo +//+build !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. + +// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf +// and http://jerasure.org/jerasure/gf-complete/tree/master + +// func galMulSSSE3Xor(low, high, in, out []byte) +TEXT ·galMulSSSE3Xor(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP), SI // R11: &in + MOVQ in_len+56(FP), R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + SHRQ $4, R9 // len(in) / 16 + MOVQ SI, AX + MOVQ DX, BX + ANDQ $15, AX + ANDQ $15, BX + CMPQ R9, $0 + JEQ done_xor + ORQ AX, BX + CMPQ BX, $0 + JNZ loopback_xor + +loopback_xor_aligned: + MOVOA (SI), X0 // in[x] + MOVOA (DX), X4 // out[x] + MOVOA X0, X1 // in[x] + MOVOA X6, X2 // low copy + MOVOA X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + PXOR X4, X3 // X3: Result xor existing out + MOVOA X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback_xor_aligned + JMP done_xor + +loopback_xor: + MOVOU (SI), X0 // in[x] + MOVOU (DX), X4 // out[x] + MOVOU X0, X1 // in[x] + MOVOU X6, X2 // low copy + MOVOU X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + PXOR X4, X3 // X3: Result xor existing out + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, 
DX // out+=16 + SUBQ $1, R9 + JNZ loopback_xor + +done_xor: + RET + +// func galMulSSSE3(low, high, in, out []byte) +TEXT ·galMulSSSE3(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP), SI // R11: &in + MOVQ in_len+56(FP), R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + MOVQ SI, AX + MOVQ DX, BX + SHRQ $4, R9 // len(in) / 16 + ANDQ $15, AX + ANDQ $15, BX + CMPQ R9, $0 + JEQ done + ORQ AX, BX + CMPQ BX, $0 + JNZ loopback + +loopback_aligned: + MOVOA (SI), X0 // in[x] + MOVOA X0, X1 // in[x] + MOVOA X6, X2 // low copy + MOVOA X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + MOVOA X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback_aligned + JMP done + +loopback: + MOVOU (SI), X0 // in[x] + MOVOU X0, X1 // in[x] + MOVOA X6, X2 // low copy + MOVOA X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback + +done: + RET + +// func galMulAVX2Xor(low, high, in, out []byte) +TEXT ·galMulAVX2Xor(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + MOVOU (SI), X6 // X6: low + MOVOU (DX), X7 // X7: high + MOVQ in_len+56(FP), R9 // R9: len(in) + + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $5, R9 // len(in) / 32 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_xor_avx2 + +loopback_xor_avx2: + VMOVDQU (SI), Y0 + VMOVDQU (DX), Y4 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y1, Y1 // Y1: high input + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPXOR Y3, Y2, Y3 // Y3: Result + VPXOR Y4, Y3, Y4 // Y4: Result + VMOVDQU Y4, (DX) + + ADDQ $32, SI // in+=32 + ADDQ $32, DX // out+=32 + SUBQ $1, R9 + JNZ loopback_xor_avx2 + +done_xor_avx2: + VZEROUPPER + RET + +// func galMulAVX2(low, high, in, out []byte) +TEXT ·galMulAVX2(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + MOVOU (SI), X6 // X6: low + MOVOU (DX), X7 // X7: high + MOVQ in_len+56(FP), R9 // R9: len(in) + + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $5, R9 // len(in) / 32 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_avx2 + +loopback_avx2: + VMOVDQU (SI), Y0 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y1, Y1 // Y1: high input + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPXOR Y3, Y2, Y4 // Y4: Result + VMOVDQU Y4, (DX) + + ADDQ $32, SI // in+=32 + ADDQ $32, DX // out+=32 + SUBQ $1, R9 + JNZ loopback_avx2 + +done_avx2: + VZEROUPPER + RET + +// func galMulAVX2Xor_64(low, high, in, out []byte) +TEXT 
·galMulAVX2Xor_64(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + MOVQ in_len+56(FP), R9 // R9: len(in) + + VBROADCASTI128 (SI), Y6 // low table + VBROADCASTI128 (DX), Y7 // high high table + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $6, R9 // len(in) / 64 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_xor_avx2_64 + +loopback_xor_avx2_64: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y10 + VMOVDQU (DX), Y4 + VMOVDQU 32(DX), Y14 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPSRLQ $4, Y10, Y11 // Y11: high input 2 + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y10, Y10 // Y10: low input 2 + VPAND Y8, Y1, Y1 // Y11: high input + VPAND Y8, Y11, Y11 // Y11: high input 2 + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2 + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2 + VPXOR Y3, Y2, Y3 // Y3: Result + VPXOR Y13, Y12, Y13 // Y13: Result 2 + VPXOR Y4, Y3, Y4 // Y4: Result + VPXOR Y14, Y13, Y14 // Y4: Result 2 + VMOVDQU Y4, (DX) + VMOVDQU Y14, 32(DX) + + ADDQ $64, SI // in+=64 + ADDQ $64, DX // out+=64 + SUBQ $1, R9 + JNZ loopback_xor_avx2_64 + +done_xor_avx2_64: + VZEROUPPER + RET + +// func galMulAVX2_64(low, high, in, out []byte) +TEXT ·galMulAVX2_64(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + MOVQ in_len+56(FP), R9 // R9: len(in) + VBROADCASTI128 (SI), Y6 // low table + VBROADCASTI128 (DX), Y7 // high high table + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $6, R9 // len(in) / 64 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_avx2_64 + +loopback_avx2_64: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y10 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPSRLQ $4, Y10, Y11 // Y11: high input 2 + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y10, Y10 // Y10: low input + VPAND Y8, Y1, Y1 // Y1: high input + VPAND Y8, Y11, Y11 // Y11: high input 2 + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2 + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2 + VPXOR Y3, Y2, Y4 // Y4: Result + VPXOR Y13, Y12, Y14 // Y14: Result 2 + VMOVDQU Y4, (DX) + VMOVDQU Y14, 32(DX) + + ADDQ $64, SI // in+=64 + ADDQ $64, DX // out+=64 + SUBQ $1, R9 + JNZ loopback_avx2_64 + +done_avx2_64: + VZEROUPPER + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go new file mode 100644 index 000000000..8ef402bf0 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go @@ -0,0 +1,130 @@ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2017, Minio, Inc. 
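For orientation, the SSSE3/AVX2 kernels in galois_amd64.s above all implement the nibble-table scheme from the SNIA paper referenced in the file header: each input byte is split into its low and high 4-bit halves, each half indexes a 16-entry table of partial products, and the two lookups are XORed together (the Xor variants additionally fold in the existing output). Below is a minimal scalar sketch of that idea, not part of the vendored code; the names are illustrative, with low/high standing in for the 16-byte tables the assembly broadcasts into X6/X7 (or Y6/Y7).

package main

import "fmt"

// galMulScalar mirrors what galMulSSSE3/galMulAVX2 compute per byte:
// out[i] = c*in[i] in GF(2^8), built as low[in&0x0f] ^ high[in>>4],
// where low[n] = c*n and high[n] = c*(n<<4) for a fixed multiplier c.
func galMulScalar(low, high *[16]byte, in, out []byte) {
	for i, b := range in {
		out[i] = low[b&0x0f] ^ high[b>>4]
	}
}

// galMulScalarXor mirrors the ...Xor variants, which accumulate into out
// instead of overwriting it.
func galMulScalarXor(low, high *[16]byte, in, out []byte) {
	for i, b := range in {
		out[i] ^= low[b&0x0f] ^ high[b>>4]
	}
}

func main() {
	// Identity tables (multiplier c = 1): low[n] = n, high[n] = n<<4.
	var low, high [16]byte
	for n := 0; n < 16; n++ {
		low[n], high[n] = byte(n), byte(n<<4)
	}
	in := []byte{0x53, 0xca, 0x0f}
	out := make([]byte, len(in))
	galMulScalar(&low, &high, in, out)
	fmt.Printf("%x\n", out) // 53ca0f — multiplying by 1 reproduces the input
}

The real tables come from mulTableLow/mulTableHigh (or slices of multiply256LUT8), which is why the assembly only ever needs 16-byte loads per multiplier.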
+ +package reedsolomon + +const pshufb = true + +//go:noescape +func galMulNEON(low, high, in, out []byte) + +//go:noescape +func galMulXorNEON(low, high, in, out []byte) + +func galMulSlice(c byte, in, out []byte, o *options) { + if c == 1 { + copy(out, in) + return + } + var done int + galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done = (len(in) >> 5) << 5 + + remain := len(in) - done + if remain > 0 { + mt := mulTable[c][:256] + for i := done; i < len(in); i++ { + out[i] = mt[in[i]] + } + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + if c == 1 { + sliceXor(in, out, o) + return + } + var done int + galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done = (len(in) >> 5) << 5 + + remain := len(in) - done + if remain > 0 { + mt := mulTable[c][:256] + for i := done; i < len(in); i++ { + out[i] ^= mt[in[i]] + } + } +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + // 64 byte aligned, always full. + xorSliceNEON(x, y) +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + mulAdd8(x, y, log_m, o) + sliceXor(x, y, o) +} + +// 2-way butterfly +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // 64 byte aligned, always full. + xorSliceNEON(x, y) + // Reference version: + refMulAdd(x, y, log_m) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + sliceXor(x, y, o) + mulAdd8(x, y, log_m, o) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} + +func mulAdd8(out, in []byte, log_m ffe8, o *options) { + t := &multiply256LUT8[log_m] + galMulXorNEON(t[:16], t[16:32], in, out) + done := (len(in) >> 5) << 5 + in = in[done:] + if len(in) > 0 { + out = out[done:] + refMulAdd8(in, out, log_m) + } +} + +func mulgf8(out, in []byte, log_m ffe8, o *options) { + var done int + t := &multiply256LUT8[log_m] + galMulNEON(t[:16], t[16:32], in, out) + done = (len(in) >> 5) << 5 + + remain := len(in) - done + if remain > 0 { + mt := mul8LUTs[log_m].Value[:] + for i := done; i < len(in); i++ { + out[i] ^= byte(mt[in[i]]) + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s new file mode 100644 index 000000000..772dfac96 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s @@ -0,0 +1,102 @@ +//+build !noasm +//+build !appengine +//+build !gccgo +//+build !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2017, Minio, Inc. 
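The arm64 helpers just added (mulAdd8, mulgf8) show the pattern used throughout these dispatch functions: the SIMD kernel consumes only whole 32-byte blocks (64 bytes on the AVX2 path, 16 on SSSE3), and whatever remains is finished through the 256-entry lookup table. A minimal sketch of that prefix/tail split follows; kernel and table are stand-ins for galMulXorNEON and the mul8LUTs entry, not names from the vendored code.

package sketch

// mulAddTail applies kernel to the 32-byte-aligned prefix of in/out and then
// finishes the remaining tail with a scalar table lookup, matching the
// structure of mulAdd8/mulgf8 above. kernel is expected to ignore any partial
// trailing block, exactly as galMulXorNEON does.
func mulAddTail(kernel func(in, out []byte), table *[256]byte, in, out []byte) {
	kernel(in, out)
	done := (len(in) >> 5) << 5 // bytes actually covered by the vector kernel
	for i := done; i < len(in); i++ {
		out[i] ^= table[in[i]] // same GF(2^8) product, one byte at a time
	}
}

The AVX2 variants use (len >> 6) << 6 and the SSSE3 variants (len >> 4) << 4 for the same split, so only the block size changes between architectures.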
+ +#define LOAD(LO1, LO2, HI1, HI2) \ + VLD1.P 32(R1), [LO1.B16, LO2.B16] \ + \ + \ // Get low input and high input + VUSHR $4, LO1.B16, HI1.B16 \ + VUSHR $4, LO2.B16, HI2.B16 \ + VAND V8.B16, LO1.B16, LO1.B16 \ + VAND V8.B16, LO2.B16, LO2.B16 + +#define GALOIS_MUL(MUL_LO, MUL_HI, OUT1, OUT2, TMP1, TMP2) \ + \ // Mul low part and mul high part + VTBL V0.B16, [MUL_LO.B16], OUT1.B16 \ + VTBL V10.B16, [MUL_HI.B16], OUT2.B16 \ + VTBL V1.B16, [MUL_LO.B16], TMP1.B16 \ + VTBL V11.B16, [MUL_HI.B16], TMP2.B16 \ + \ + \ // Combine results + VEOR OUT2.B16, OUT1.B16, OUT1.B16 \ + VEOR TMP2.B16, TMP1.B16, OUT2.B16 + +// func galMulNEON(low, high, in, out []byte) +TEXT ·galMulNEON(SB), 7, $0 + MOVD in_base+48(FP), R1 + MOVD in_len+56(FP), R2 // length of message + MOVD out_base+72(FP), R5 + SUBS $32, R2 + BMI complete + + MOVD low+0(FP), R10 // R10: &low + MOVD high+24(FP), R11 // R11: &high + VLD1 (R10), [V6.B16] + VLD1 (R11), [V7.B16] + + // + // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error + // WORD $0x4e010c68 // dup v8.16b, w3 + // + MOVD $0x0f, R3 + VMOV R3, V8.B[0] + VDUP V8.B[0], V8.B16 + +loop: + // Main loop + LOAD(V0, V1, V10, V11) + GALOIS_MUL(V6, V7, V4, V5, V14, V15) + + // Store result + VST1.P [V4.D2, V5.D2], 32(R5) + + SUBS $32, R2 + BPL loop + +complete: + RET + +// func galMulXorNEON(low, high, in, out []byte) +TEXT ·galMulXorNEON(SB), 7, $0 + MOVD in_base+48(FP), R1 + MOVD in_len+56(FP), R2 // length of message + MOVD out_base+72(FP), R5 + SUBS $32, R2 + BMI completeXor + + MOVD low+0(FP), R10 // R10: &low + MOVD high+24(FP), R11 // R11: &high + VLD1 (R10), [V6.B16] + VLD1 (R11), [V7.B16] + + // + // Use an extra instruction below since `VDUP R3, V8.B16` generates assembler error + // WORD $0x4e010c68 // dup v8.16b, w3 + // + MOVD $0x0f, R3 + VMOV R3, V8.B[0] + VDUP V8.B[0], V8.B16 + +loopXor: + // Main loop + VLD1 (R5), [V20.B16, V21.B16] + + LOAD(V0, V1, V10, V11) + GALOIS_MUL(V6, V7, V4, V5, V14, V15) + + VEOR V20.B16, V4.B16, V4.B16 + VEOR V21.B16, V5.B16, V5.B16 + + // Store result + VST1.P [V4.D2, V5.D2], 32(R5) + + SUBS $32, R2 + BPL loopXor + +completeXor: + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go new file mode 100644 index 000000000..dac9b1363 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go @@ -0,0 +1,3532 @@ +// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && !nopshufb && gc + +package reedsolomon + +func _dummy_() + +//go:noescape +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) + +// mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. 
+// +//go:noescape +func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. 
+// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. 
+// +//go:noescape +func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. 
+// +//go:noescape +func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. 
+// +//go:noescape +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. 
+// +//go:noescape +func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. 
+// +//go:noescape +func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. 
+// +//go:noescape +func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. 
+// +//go:noescape +func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. 
+// +//go:noescape +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. 
+// +//go:noescape +func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. 
+// +//go:noescape +func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. 
+// +//go:noescape +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. 
+// +//go:noescape +func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. 
+// +//go:noescape +func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. 
+// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + 
+//go:noescape +func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) + +//go:noescape +func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func ifftDIT28_avx2(x []byte, y []byte, table *[32]uint8) + +//go:noescape +func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + 
+//go:noescape +func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) + +//go:noescape +func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s new file mode 100644 index 000000000..ad253a65a --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s @@ -0,0 +1,128293 @@ +// Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && !nopshufb && gc + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#define XOR3WAY(ignore, a, b, dst) \ + VPTERNLOGD $0x96, a, b, dst + +#else +#define XOR3WAY(ignore, a, b, dst) \ + VPXOR a, dst, dst \ + VPXOR b, dst, dst + +#endif + RET + +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. 
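+// (Editorial note, an assumption about intended use rather than generated
+// text: the loop below iterates len(in)>>4 times, so any tail of fewer than
+// 16 bytes is left untouched and is assumed to be handled by the caller.)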
+ +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + VZEROUPPER + RET + +// func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + ADDQ $0x40, CX + VPSRLQ $0x04, Y2, Y6 + VPSRLQ $0x04, Y3, Y5 + VPAND Y4, Y2, Y2 + VPAND Y4, Y3, Y3 + VPAND Y4, Y6, Y6 + VPAND Y4, Y5, Y5 + VPSHUFB Y2, Y0, Y2 + VPSHUFB Y3, Y0, Y3 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y5, Y1, Y5 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 + + // Store 1 outputs + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1_64_loop + VZEROUPPER + +mulAvxTwo_1x1_64_end: + RET + +// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + 
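+	// Editorial note, not generated text: VGF2P8AFFINEQB applies the 8x8 GF(2)
+	// bit-matrix broadcast into Z0 to every byte of the 64-byte block, i.e. one
+	// constant GF(2^8) multiplication per byte, so the GFNI kernels need no
+	// PSHUFB nibble-lookup tables (assumed intent of the generated code).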
VMOVDQU64 (CX), Z1 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z1, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64_loop + VZEROUPPER + +mulGFNI_1x1_64_end: + RET + +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + +// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DX), Z1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z2 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z2, Z2 + VXORPD Z1, Z2, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64Xor_loop + VZEROUPPER + +mulGFNI_1x1_64Xor_end: + RET + +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1Xor_loop + VZEROUPPER + +mulAvxGFNI_1x1Xor_end: + RET + +// func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), 
AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y2 + VMOVDQU 32(DX), Y3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + ADDQ $0x40, CX + VPSRLQ $0x04, Y5, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y5, Y5 + VPAND Y4, Y7, Y7 + VPAND Y4, Y6, Y6 + VPAND Y4, Y8, Y8 + VPSHUFB Y5, Y0, Y5 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 1 outputs + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_1x1_64Xor_end: + RET + +// func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y4, Y7, Y7 + VPAND Y4, Y9, Y9 + VPAND Y4, Y8, Y8 + VPAND Y4, Y10, Y10 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y0 + VPXOR Y3, Y5, Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2_64_loop + VZEROUPPER + +mulAvxTwo_1x2_64_end: + RET + +// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + + // Store 2 outputs + 
VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64_loop + VZEROUPPER + +mulGFNI_1x2_64_end: + RET + +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + +// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (BX), Z2 + VMOVDQU64 (DX), Z3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z5 + VXORPD Z3, Z5, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64Xor_loop + VZEROUPPER + +mulGFNI_1x2_64Xor_end: + RET + +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare 
for next loop + DECQ AX + JNZ mulAvxGFNI_1x2Xor_loop + VZEROUPPER + +mulAvxGFNI_1x2Xor_end: + RET + +// func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (BX), Y2 + VMOVDQU 32(BX), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_1x2_64Xor_end: + RET + +// func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y6, Y9, Y9 + VPAND Y6, Y11, Y11 + VPAND Y6, Y10, Y10 + VPAND Y6, Y12, Y12 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y0 + VPXOR Y5, Y7, Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y2 + VPXOR Y5, Y7, Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y4 + VPXOR Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next 
loop + DECQ AX + JNZ mulAvxTwo_1x3_64_loop + VZEROUPPER + +mulAvxTwo_1x3_64_end: + RET + +// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64_loop + VZEROUPPER + +mulGFNI_1x3_64_end: + RET + +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + +// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (BX), Z3 + VMOVDQU64 (SI), Z4 + VMOVDQU64 (DX), Z5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z3, Z7, Z3 + 
VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64Xor_loop + VZEROUPPER + +mulGFNI_1x3_64Xor_end: + RET + +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3Xor_loop + VZEROUPPER + +mulAvxGFNI_1x3Xor_end: + RET + +// func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + 
VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_1x3_64Xor_end: + RET + +// func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y4, Y6, Y6 + VPAND Y4, Y7, Y7 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y1 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y3 + + // Store 4 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x4_loop + VZEROUPPER + +mulAvxTwo_1x4_end: + RET + +// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z7, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z7, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64_loop + VZEROUPPER + +mulGFNI_1x4_64_end: + RET + +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + +// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (BX), Z4 + VMOVDQU64 (SI), Z5 + VMOVDQU64 (DI), Z6 + VMOVDQU64 (DX), Z7 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z4, Z9, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z5, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64Xor_loop + VZEROUPPER + +mulGFNI_1x4_64Xor_end: + RET + +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 
+ ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + +// func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (BX), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x4Xor_loop + VZEROUPPER + +mulAvxTwo_1x4Xor_end: + RET + +// func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y5, Y7, Y7 + VPAND Y5, Y8, Y8 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y2 + VMOVDQU 192(CX), 
Y4 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y3 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y4 + + // Store 5 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x5_loop + VZEROUPPER + +mulAvxTwo_1x5_end: + RET + +// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64_loop + VZEROUPPER + +mulGFNI_1x5_64_end: + RET + +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + +// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// 
Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (BX), Z5 + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DX), Z9 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z5, Z11, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z6, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z7, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64Xor_loop + VZEROUPPER + +mulGFNI_1x5_64Xor_end: + RET + +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + +// func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 
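+	// Editorial note, not generated text: the mulAvxTwo kernels split each source
+	// byte into low/high nibbles (VPAND with the broadcast 0x0f mask, VPSRLQ $0x04),
+	// look both nibbles up in 16-entry tables with VPSHUFB and XOR the halves; the
+	// Xor variants such as this one first load the existing outputs and accumulate
+	// into them with XOR3WAY instead of overwriting (assumed intent).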
+ // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_1x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (BX), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x5Xor_loop + VZEROUPPER + +mulAvxTwo_1x5Xor_end: + RET + +// func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y9, Y9 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y3 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y4 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y5 + + // Store 6 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + 
ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x6_loop + VZEROUPPER + +mulAvxTwo_1x6_end: + RET + +// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z11, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64_loop + VZEROUPPER + +mulGFNI_1x6_64_end: + RET + +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + +// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// 
Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (BX), Z6 + VMOVDQU64 (SI), Z7 + VMOVDQU64 (DI), Z8 + VMOVDQU64 (R8), Z9 + VMOVDQU64 (R9), Z10 + VMOVDQU64 (DX), Z11 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z6, Z13, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z7, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64Xor_loop + VZEROUPPER + +mulGFNI_1x6_64Xor_end: + RET + +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + 
ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + +// func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (BX), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x6Xor_loop + VZEROUPPER + +mulAvxTwo_1x6Xor_end: + RET + +// func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y7, Y9, Y9 + VPAND Y7, Y10, Y10 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 
96(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y4 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y5 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y6 + + // Store 7 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x7_loop + VZEROUPPER + +mulAvxTwo_1x7_end: + RET + +// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z13 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z13, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z13, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z13, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64_loop + VZEROUPPER + +mulGFNI_1x7_64_end: + RET + +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), 
DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + +// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (BX), Z7 + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (R9), Z11 + VMOVDQU64 (R10), Z12 + VMOVDQU64 (DX), Z13 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z7, Z15, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z8, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z9, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z10, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z11, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64Xor_loop + VZEROUPPER + +mulGFNI_1x7_64Xor_end: + RET + +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + 
VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + +// func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_1x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R11), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (BX), Y6 + 
VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x7Xor_loop + VZEROUPPER + +mulAvxTwo_1x7Xor_end: + RET + +// func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y8, Y10, Y10 + VPAND Y8, Y11, Y11 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y5 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y6 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y7 + + // Store 8 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x8_loop + VZEROUPPER + +mulAvxTwo_1x8_end: + RET + +// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), 
DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64_loop + VZEROUPPER + +mulGFNI_1x8_64_end: + RET + +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + +// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (BX), Z8 + VMOVDQU64 (SI), Z9 + VMOVDQU64 (DI), Z10 + VMOVDQU64 (R8), Z11 + VMOVDQU64 (R9), Z12 + VMOVDQU64 (R10), Z13 + VMOVDQU64 (R11), Z14 + VMOVDQU64 (DX), Z15 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z8, Z17, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z9, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z10, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z11, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64Xor_loop + VZEROUPPER + +mulGFNI_1x8_64Xor_end: + RET + +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, 
Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + +// func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_1x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU (R11), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (R12), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU (BX), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x8Xor_loop + VZEROUPPER + +mulAvxTwo_1x8Xor_end: + RET + +// func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 32 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y9, Y11, Y11 + VPAND Y9, Y12, Y12 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y6 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y7 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y8 + + // Store 9 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x9_loop + VZEROUPPER + +mulAvxTwo_1x9_end: + RET + +// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z17 + 
ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z17, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z17, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z17, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64_loop + VZEROUPPER + +mulGFNI_1x9_64_end: + RET + +// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + +// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ 
(CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (BX), Z9 + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (R10), Z14 + VMOVDQU64 (R11), Z15 + VMOVDQU64 (R12), Z16 + VMOVDQU64 (DX), Z17 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z9, Z19, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z10, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z11, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64Xor_loop + VZEROUPPER + +mulGFNI_1x9_64Xor_end: + RET + +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9Xor_loop + VZEROUPPER + +mulAvxGFNI_1x9Xor_end: + RET + +// func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_1x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU (R11), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU (R12), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU (R13), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU (BX), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x9Xor_loop + VZEROUPPER + +mulAvxTwo_1x9Xor_end: + RET + +// func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y10, Y12, Y12 + VPAND Y10, Y13, Y13 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y7 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y8 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y9 + + // Store 10 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (R14) + ADDQ $0x20, R14 + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x10_loop + VZEROUPPER + +mulAvxTwo_1x10_end: + RET + +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ 
in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z19 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + 
VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64Xor_loop + VZEROUPPER + +mulGFNI_1x10_64Xor_end: + RET + +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + 
MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + +// func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_1x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (SI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, 
Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU (R11), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU (R12), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU (R13), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU (R14), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU (BX), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (R14) + ADDQ $0x20, R14 + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x10Xor_loop + VZEROUPPER + +mulAvxTwo_1x10Xor_end: + RET + +// func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 1 outputs + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1_64_loop + VZEROUPPER + +mulAvxTwo_2x1_64_end: + RET + +// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64(SB), $0-88 + // Loading all tables to 
registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64_loop + VZEROUPPER + +mulGFNI_2x1_64_end: + RET + +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + +// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (BX), Z2 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64Xor_loop + VZEROUPPER + +mulGFNI_2x1_64Xor_end: + RET + +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 + // Loading 
all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1Xor_loop + VZEROUPPER + +mulAvxGFNI_2x1Xor_end: + RET + +// func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 1 outputs + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x1_64Xor_end: + RET + +// func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ 
R8, SI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2_64_loop + VZEROUPPER + +mulAvxTwo_2x2_64_end: + RET + +// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64_loop + VZEROUPPER + +mulGFNI_2x2_64_end: + RET + +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), 
BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + +// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (SI), Z4 + VMOVDQU64 (BX), Z5 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64Xor_loop + VZEROUPPER + +mulGFNI_2x2_64Xor_end: + RET + +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB 
$0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2Xor_loop + VZEROUPPER + +mulAvxGFNI_2x2Xor_end: + RET + +// func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (SI), Y2 + VMOVDQU 32(SI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x2_64Xor_end: + RET + +// func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ 
$0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3_64_loop + VZEROUPPER + +mulAvxTwo_2x3_64_end: + RET + +// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64_loop + VZEROUPPER + +mulGFNI_2x3_64_end: + RET + +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), 
$0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + +// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (BX), Z8 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64Xor_loop + VZEROUPPER + +mulGFNI_2x3_64Xor_end: + RET + +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, 
AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3Xor_loop + VZEROUPPER + +mulAvxGFNI_2x3Xor_end: + RET + +// func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + VMOVDQU (SI), Y4 + VMOVDQU 32(SI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 
+ VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x3_64Xor_end: + RET + +// func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x4_loop + VZEROUPPER + +mulAvxTwo_2x4_end: + RET + +// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 
24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64_loop + VZEROUPPER + +mulGFNI_2x4_64_end: + RET + +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + +// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_2x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (BX), Z11 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64Xor_loop + VZEROUPPER + +mulGFNI_2x4_64Xor_end: + RET + +// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ 
$0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + +// func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (SI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x4Xor_loop + VZEROUPPER + +mulAvxTwo_2x4Xor_end: + RET + +// func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND 
Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x5_loop + VZEROUPPER + +mulAvxTwo_2x5_end: + RET + +// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + 
VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64_loop + VZEROUPPER + +mulGFNI_2x5_64_end: + RET + +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + +// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (BX), Z14 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, 
Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64Xor_loop + VZEROUPPER + +mulGFNI_2x5_64Xor_end: + RET + +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + +// func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + 
// Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (SI), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x5Xor_loop + VZEROUPPER + +mulAvxTwo_2x5Xor_end: + RET + +// func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + 
VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x6_loop + VZEROUPPER + +mulAvxTwo_2x6_end: + RET + +// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + 
VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64_loop + VZEROUPPER + +mulGFNI_2x6_64_end: + RET + +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + +// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), 
Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (SI), Z12 + VMOVDQU64 (DI), Z13 + VMOVDQU64 (R8), Z14 + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (BX), Z17 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64Xor_loop + VZEROUPPER + +mulGFNI_2x6_64Xor_end: + RET + +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + +// func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (SI), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB 
Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x6Xor_loop + VZEROUPPER + +mulAvxTwo_2x6Xor_end: + RET + +// func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, 
(R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x7_loop + VZEROUPPER + +mulAvxTwo_2x7_end: + RET + +// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64_loop + VZEROUPPER + +mulGFNI_2x7_64_end: + RET + +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), 
R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + +// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (SI), Z14 + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (BX), Z20 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z18, Z22, 
Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64Xor_loop + VZEROUPPER + +mulGFNI_2x7_64Xor_end: + RET + +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI 
+ VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + +// func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_2x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (SI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + 
VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x7Xor_loop + VZEROUPPER + +mulAvxTwo_2x7Xor_end: + RET + +// func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + 
ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x8_loop + VZEROUPPER + +mulAvxTwo_2x8_end: + RET + +// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64_loop + VZEROUPPER + +mulGFNI_2x8_64_end: + RET + +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + +// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start 
offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (SI), Z16 + VMOVDQU64 (DI), Z17 + VMOVDQU64 (R8), Z18 + VMOVDQU64 (R9), Z19 + VMOVDQU64 (R10), Z20 + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (BX), Z23 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64Xor_loop + VZEROUPPER + +mulGFNI_2x8_64Xor_end: + RET + +// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + +// func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_2x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, 
Y6) + VMOVDQU (SI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x8Xor_loop + VZEROUPPER + +mulAvxTwo_2x8Xor_end: + RET + +// func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, 
Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x9_loop + VZEROUPPER + +mulAvxTwo_2x9_end: + RET + +// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, 
DX + ADDQ R14, CX + +mulGFNI_2x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64_loop + VZEROUPPER + +mulGFNI_2x9_64_end: + RET + +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + +// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (SI), Z18 + VMOVDQU64 (DI), Z19 + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (BX), Z26 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, 
Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64Xor_loop + VZEROUPPER + +mulGFNI_2x9_64Xor_end: + RET + +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + +// func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_2x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU (SI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 
+ VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x9Xor_loop + VZEROUPPER + +mulAvxTwo_2x9Xor_end: + RET + +// func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 55 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, 
Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x10_loop + VZEROUPPER + +mulAvxTwo_2x10_end: + RET + +// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + 
ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64_loop + VZEROUPPER + +mulGFNI_2x10_64_end: + RET + +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + 
VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + +// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (SI), Z20 + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (BX), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + 
VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64Xor_loop + VZEROUPPER + +mulGFNI_2x10_64Xor_end: + RET + +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + +// func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 55 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y11 + 
VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU (R15), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU (SI), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x10Xor_loop + VZEROUPPER + +mulAvxTwo_2x10Xor_end: + RET + +// func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start 
offset to input + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1_64_loop + VZEROUPPER + +mulAvxTwo_3x1_64_end: + RET + +// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64_loop + VZEROUPPER + +mulGFNI_3x1_64_end: + RET + +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ 
(SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + +// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (SI), Z3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64Xor_loop + VZEROUPPER + +mulGFNI_3x1_64Xor_end: + RET + +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ 
$0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1Xor_loop + VZEROUPPER + +mulAvxGFNI_3x1Xor_end: + RET + +// func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x1_64Xor_end: + RET + +// func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 
+ VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2_64_loop + VZEROUPPER + +mulAvxTwo_3x2_64_end: + RET + +// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64_loop + VZEROUPPER + +mulGFNI_3x2_64_end: + RET + +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + +// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (DI), Z6 + VMOVDQU64 (SI), Z7 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64Xor_loop + VZEROUPPER + +mulGFNI_3x2_64Xor_end: + RET + +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables 
to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2Xor_loop + VZEROUPPER + +mulAvxGFNI_3x2Xor_end: + RET + +// func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( 
$0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x2_64Xor_end: + RET + +// func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and 
process 64 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x3_64_loop + VZEROUPPER + +mulAvxTwo_3x3_64_end: + RET + +// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64_loop + VZEROUPPER + +mulGFNI_3x3_64_end: + RET + +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + +// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (SI), Z11 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs 
+ VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64Xor_loop + VZEROUPPER + +mulGFNI_3x3_64Xor_end: + RET + +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3Xor_loop + VZEROUPPER + +mulAvxGFNI_3x3Xor_end: + RET + +// func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(DI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, 
Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x3_64Xor_end: + RET + +// func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + 
ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x4_loop + VZEROUPPER + +mulAvxTwo_3x4_end: + RET + +// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, 
Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64_loop + VZEROUPPER + +mulGFNI_3x4_64_end: + RET + +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + +// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (SI), Z15 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64Xor_loop + VZEROUPPER + +mulGFNI_3x4_64Xor_end: + RET + +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + +mulAvxGFNI_3x4Xor_end: + RET + +// func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (DI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + 
XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x4Xor_loop + VZEROUPPER + +mulAvxTwo_3x4Xor_end: + RET + +// func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( 
$0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x5_loop + VZEROUPPER + +mulAvxTwo_3x5_end: + RET + +// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64_loop + VZEROUPPER + +mulGFNI_3x5_64_end: + RET + +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + +// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (DI), Z15 + VMOVDQU64 
(R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (SI), Z19 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64Xor_loop + VZEROUPPER + +mulGFNI_3x5_64Xor_end: + RET + +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + +// func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_3x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (DI), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 
640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x5Xor_loop + VZEROUPPER + +mulAvxTwo_3x5Xor_end: + RET + +// func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX 
+ VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x6_loop + VZEROUPPER + +mulAvxTwo_3x6_end: + RET + +// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, 
Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64_loop + VZEROUPPER + +mulGFNI_3x6_64_end: + RET + +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + +// func mulGFNI_3x6_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (DI), Z18 + VMOVDQU64 (R8), Z19 + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (SI), Z23 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64Xor_loop + VZEROUPPER + +mulGFNI_3x6_64Xor_end: + RET + +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers 
estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + +// func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), 
R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (DI), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x6Xor_loop + VZEROUPPER + +mulAvxTwo_3x6Xor_end: + RET + +// func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT 
·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 54 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, 
Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x7_loop + VZEROUPPER + +mulAvxTwo_3x7_end: + RET + +// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + 
VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64_loop + VZEROUPPER + +mulGFNI_3x7_64_end: + RET + +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + 
VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + +// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (SI), Z27 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, 
R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64Xor_loop + VZEROUPPER + +mulGFNI_3x7_64Xor_end: + RET + +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 
+ VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + +// func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 54 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_3x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (DI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, 
Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x7Xor_loop + VZEROUPPER + +mulAvxTwo_3x7Xor_end: + RET + +// func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, 
Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x8_loop + VZEROUPPER + +mulAvxTwo_3x8_end: + RET + +// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 
24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64_loop + VZEROUPPER + +mulGFNI_3x8_64_end: + RET + +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load and 
process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + +// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64Xor_loop + VZEROUPPER + +mulGFNI_3x8_64Xor_end: + RET + +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 
48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + +// func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to 
registers + // Destination kept in GP registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_3x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU (DI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 
1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x8Xor_loop + VZEROUPPER + +mulAvxTwo_3x8Xor_end: + RET + +// func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + 
ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x9_loop + VZEROUPPER + +mulAvxTwo_3x9_end: + RET + +// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 
40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64_loop + VZEROUPPER + +mulGFNI_3x9_64_end: + RET + +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // 
Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next 
loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + +// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, 
Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64Xor_loop + VZEROUPPER + +mulGFNI_3x9_64Xor_end: + RET + +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + +// func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_3x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 
+ VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX 
+ JNZ mulAvxTwo_3x9Xor_loop + VZEROUPPER + +mulAvxTwo_3x9Xor_end: + RET + +// func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 75 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x10_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + 
VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y13 + ADDQ $0x20, AX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_3x10_loop + VZEROUPPER + +mulAvxTwo_3x10_end: + RET + +// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), 
R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64_loop + VZEROUPPER + +mulGFNI_3x10_64_end: + RET + +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ 
out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ 
$0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + +// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (DI), Z20 + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (SI), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), 
Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64Xor_loop + VZEROUPPER + +mulGFNI_3x10_64Xor_end: + RET + +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + +// func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 75 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x10Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_3x10Xor_loop: + // Load and 
process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU (R15), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU (SI), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y13 + ADDQ $0x20, AX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + 
XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_3x10Xor_loop + VZEROUPPER + +mulAvxTwo_3x10Xor_end: + RET + +// func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + 
VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1_64_loop + VZEROUPPER + +mulAvxTwo_4x1_64_end: + RET + +// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64_loop + VZEROUPPER + +mulGFNI_4x1_64_end: + RET + +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 
+ + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + +// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DI), Z4 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64Xor_loop + VZEROUPPER + +mulGFNI_4x1_64Xor_end: + RET + +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1Xor_loop + VZEROUPPER + +mulAvxGFNI_4x1Xor_end: + RET + +// func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x1_64Xor_end: + RET + +// func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + 
+mulAvxTwo_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2_64_loop + VZEROUPPER + +mulAvxTwo_4x2_64_end: + RET + +// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ 
start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64_loop + VZEROUPPER + +mulGFNI_4x2_64_end: + RET + +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + +// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_4x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DI), Z9 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64Xor_loop + VZEROUPPER + +mulGFNI_4x2_64Xor_end: + RET + +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, 
Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2Xor_loop + VZEROUPPER + +mulAvxGFNI_4x2Xor_end: + RET + +// func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + 
XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x2_64Xor_end: + RET + +// func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, 
Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3_64_loop + VZEROUPPER + +mulAvxTwo_4x3_64_end: + RET + +// func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + 
VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64_loop + VZEROUPPER + +mulGFNI_4x3_64_end: + RET + +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + +// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ 
(DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (DI), Z14 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64Xor_loop + VZEROUPPER + +mulGFNI_4x3_64Xor_end: + RET + +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3Xor_loop + VZEROUPPER + +mulAvxGFNI_4x3Xor_end: + RET + +// func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + 
VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x3_64Xor_end: + RET + +// func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, 
Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x4_loop + VZEROUPPER + +mulAvxTwo_4x4_end: + RET + +// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, 
BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64_loop + VZEROUPPER + +mulGFNI_4x4_64_end: + RET + +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + +// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (DI), Z19 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64Xor_loop + VZEROUPPER + +mulGFNI_4x4_64Xor_end: + RET + +// 
func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + +// func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 
24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R8), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x4Xor_loop + VZEROUPPER + +mulAvxTwo_4x4Xor_end: + RET + +// func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX 
+ JZ mulAvxTwo_4x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, 
(R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x5_loop + VZEROUPPER + +mulAvxTwo_4x5_end: + RET + +// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64_loop + VZEROUPPER + +mulGFNI_4x5_64_end: + RET + +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + +// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 
+ VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (DI), Z24 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64Xor_loop + VZEROUPPER + +mulGFNI_4x5_64Xor_end: + RET + +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + +// func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), 
R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_4x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (R8), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + 
VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x5Xor_loop + VZEROUPPER + +mulAvxTwo_4x5Xor_end: + RET + +// func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 59 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( 
$0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x6_loop + VZEROUPPER + +mulAvxTwo_4x6_end: + RET + +// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB 
$0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64_loop + VZEROUPPER + +mulGFNI_4x6_64_end: + RET + +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 
to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + +// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R8), Z24 + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, 
Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64Xor_loop + VZEROUPPER + +mulGFNI_4x6_64Xor_end: + RET + +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + +// func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 59 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 
+ VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (R8), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ 
$0x20, R13 + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x6Xor_loop + VZEROUPPER + +mulAvxTwo_4x6Xor_end: + RET + +// func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, 
Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x7_loop + VZEROUPPER + +mulAvxTwo_4x7_end: + RET + +// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64_loop: + // Load and process 64 
bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64_loop + VZEROUPPER + +mulGFNI_4x7_64_end: + RET + +// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 
+ ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + +// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + 
VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next 
loop + DECQ AX + JNZ mulGFNI_4x7_64Xor_loop + VZEROUPPER + +mulGFNI_4x7_64Xor_end: + RET + +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + +// func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_4x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (R8), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), 
Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x7Xor_loop + VZEROUPPER + +mulAvxTwo_4x7Xor_end: + RET + +// func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 77 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to 
output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), 
Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x8_loop + VZEROUPPER + +mulAvxTwo_4x8_end: + RET + +// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + 
VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64_loop + VZEROUPPER + +mulGFNI_4x8_64_end: + RET + +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + 
+mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + +// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full 
registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 
200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64Xor_loop + VZEROUPPER + +mulGFNI_4x8_64Xor_end: + RET + +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + +// func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 77 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_4x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB 
Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + 
VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x8Xor_loop + VZEROUPPER + +mulAvxTwo_4x8Xor_end: + RET + +// func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 86 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x9_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 
to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y12 + ADDQ $0x20, AX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + 
VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_4x9_loop + VZEROUPPER + +mulAvxTwo_4x9_end: + RET + +// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, 
Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64_loop + VZEROUPPER + +mulGFNI_4x9_64_end: + RET + +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, 
Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + +// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM 
used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, 
Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64Xor_loop + VZEROUPPER + +mulGFNI_4x9_64Xor_end: + RET + +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + 
ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + +// func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 86 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x9Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + 
MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_4x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + 
VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y12 + ADDQ $0x20, AX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_4x9Xor_loop + VZEROUPPER + +mulAvxTwo_4x9Xor_end: + RET + +// func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ 
$0x0000000f, R10 + MOVQ R10, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + 
VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_4x10_loop + VZEROUPPER + +mulAvxTwo_4x10_end: + RET + +// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 
104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 
Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), 
Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop + VZEROUPPER + 
+mulGFNI_4x10_64Xor_end: + RET + +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + 
VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10Xor_loop + VZEROUPPER + +mulAvxGFNI_4x10Xor_end: + RET + +// func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_4x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), 
Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + 
VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_4x10Xor_loop + VZEROUPPER + +mulAvxTwo_4x10Xor_end: + RET + +// func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND 
Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1_64_loop + VZEROUPPER + +mulAvxTwo_5x1_64_end: + RET + +// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ 
$0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64_loop + VZEROUPPER + +mulGFNI_5x1_64_end: + RET + +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + +// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R8), Z5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 
+ VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64Xor_loop + VZEROUPPER + +mulGFNI_5x1_64Xor_end: + RET + +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1Xor_loop + VZEROUPPER + +mulAvxGFNI_5x1Xor_end: + RET + +// func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB 
Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x1_64Xor_end: + RET + +// func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + 
VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2_64_loop + VZEROUPPER + +mulAvxTwo_5x2_64_end: + RET + +// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + 
MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64_loop + VZEROUPPER + +mulGFNI_5x2_64_end: + RET + +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, 
Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + +// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R9), Z10 + VMOVDQU64 (R8), Z11 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64Xor_loop + VZEROUPPER + +mulGFNI_5x2_64Xor_end: + RET + +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ 
start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2Xor_loop + VZEROUPPER + +mulAvxGFNI_5x2Xor_end: + RET + +// func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB 
Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x2_64Xor_end: + RET + +// func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, 
Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + 
VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x3_64_loop + VZEROUPPER + +mulAvxTwo_5x3_64_end: + RET + +// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64_loop + VZEROUPPER + +mulGFNI_5x3_64_end: + RET + +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + +// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), 
Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (R8), Z17 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64Xor_loop + VZEROUPPER + +mulGFNI_5x3_64Xor_end: + RET + +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3Xor_loop + VZEROUPPER + +mulAvxGFNI_5x3Xor_end: + RET + +// func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(R9), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 
3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ 
mulAvxTwo_5x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x3_64Xor_end: + RET + +// func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, 
Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x4_loop + VZEROUPPER + +mulAvxTwo_5x4_end: + RET + +// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from 
input 4 to 4 outputs
+	VMOVDQU64 (CX), Z24
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+	VXORPD Z22, Z25, Z22
+	VGF2P8AFFINEQB $0x00, Z19, Z24, Z25
+	VXORPD Z23, Z25, Z23
+
+	// Store 4 outputs
+	VMOVDQU64 Z20, (R9)
+	ADDQ $0x40, R9
+	VMOVDQU64 Z21, (R10)
+	ADDQ $0x40, R10
+	VMOVDQU64 Z22, (R11)
+	ADDQ $0x40, R11
+	VMOVDQU64 Z23, (R8)
+	ADDQ $0x40, R8
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulGFNI_5x4_64_loop
+	VZEROUPPER
+
+mulGFNI_5x4_64_end:
+	RET
+
+// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_5x4(SB), $0-88
+	// Loading 10 of 20 tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 26 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x05, AX
+	TESTQ AX, AX
+	JZ mulAvxGFNI_5x4_end
+	VBROADCASTSD (CX), Y0
+	VBROADCASTSD 8(CX), Y1
+	VBROADCASTSD 16(CX), Y2
+	VBROADCASTSD 24(CX), Y3
+	VBROADCASTSD 32(CX), Y4
+	VBROADCASTSD 40(CX), Y5
+	VBROADCASTSD 48(CX), Y6
+	VBROADCASTSD 56(CX), Y7
+	VBROADCASTSD 64(CX), Y8
+	VBROADCASTSD 72(CX), Y9
+	MOVQ in_base+24(FP), DX
+	MOVQ (DX), BX
+	MOVQ 24(DX), SI
+	MOVQ 48(DX), DI
+	MOVQ 72(DX), R8
+	MOVQ 96(DX), DX
+	MOVQ out_base+48(FP), R9
+	MOVQ out_base+48(FP), R9
+	MOVQ (R9), R10
+	MOVQ 24(R9), R11
+	MOVQ 48(R9), R12
+	MOVQ 72(R9), R9
+	MOVQ start+72(FP), R13
+
+	// Add start offset to output
+	ADDQ R13, R10
+	ADDQ R13, R11
+	ADDQ R13, R12
+	ADDQ R13, R9
+
+	// Add start offset to input
+	ADDQ R13, BX
+	ADDQ R13, SI
+	ADDQ R13, DI
+	ADDQ R13, R8
+	ADDQ R13, DX
+
+mulAvxGFNI_5x4_loop:
+	// Load and process 32 bytes from input 0 to 4 outputs
+	VMOVDQU (BX), Y14
+	ADDQ $0x20, BX
+	VGF2P8AFFINEQB $0x00, Y0, Y14, Y10
+	VGF2P8AFFINEQB $0x00, Y1, Y14, Y11
+	VGF2P8AFFINEQB $0x00, Y2, Y14, Y12
+	VGF2P8AFFINEQB $0x00, Y3, Y14, Y13
+
+	// Load and process 32 bytes from input 1 to 4 outputs
+	VMOVDQU (SI), Y14
+	ADDQ $0x20, SI
+	VGF2P8AFFINEQB $0x00, Y4, Y14, Y15
+	VXORPD Y10, Y15, Y10
+	VGF2P8AFFINEQB $0x00, Y5, Y14, Y15
+	VXORPD Y11, Y15, Y11
+	VGF2P8AFFINEQB $0x00, Y6, Y14, Y15
+	VXORPD Y12, Y15, Y12
+	VGF2P8AFFINEQB $0x00, Y7, Y14, Y15
+	VXORPD Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 2 to 4 outputs
+	VMOVDQU (DI), Y14
+	ADDQ $0x20, DI
+	VGF2P8AFFINEQB $0x00, Y8, Y14, Y15
+	VXORPD Y10, Y15, Y10
+	VGF2P8AFFINEQB $0x00, Y9, Y14, Y15
+	VXORPD Y11, Y15, Y11
+	VBROADCASTSD 80(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y12, Y15, Y12
+	VBROADCASTSD 88(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 3 to 4 outputs
+	VMOVDQU (R8), Y14
+	ADDQ $0x20, R8
+	VBROADCASTSD 96(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y10, Y15, Y10
+	VBROADCASTSD 104(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y11, Y15, Y11
+	VBROADCASTSD 112(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y12, Y15, Y12
+	VBROADCASTSD 120(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y13, Y15, Y13
+
+	// Load and process 32 bytes from input 4 to 4 outputs
+	VMOVDQU (DX), Y14
+	ADDQ $0x20, DX
+	VBROADCASTSD 128(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y10, Y15, Y10
+	VBROADCASTSD 136(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y11, Y15, Y11
+	VBROADCASTSD 144(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y12, Y15, Y12
+	VBROADCASTSD 152(CX), Y15
+	VGF2P8AFFINEQB $0x00, Y15, Y14, Y15
+	VXORPD Y13, Y15, Y13
+
+	// Store 4 outputs
+	VMOVDQU Y10, (R10)
+	ADDQ $0x20, R10
+	VMOVDQU Y11, (R11)
+	ADDQ $0x20, R11
+	VMOVDQU Y12, (R12)
+	ADDQ $0x20, R12
+	VMOVDQU Y13, (R9)
+	ADDQ $0x20, R9
+
+	// Prepare for next loop
+	DECQ AX
+	JNZ mulAvxGFNI_5x4_loop
+	VZEROUPPER
+
+mulAvxGFNI_5x4_end:
+	RET
+
+// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_5x4_64Xor(SB), $0-88
+	// Loading all tables to registers
+	// Destination kept in GP registers
+	// Full registers estimated 26 YMM used
+	MOVQ n+80(FP), AX
+	MOVQ matrix_base+0(FP), CX
+	SHRQ $0x06, AX
+	TESTQ AX, AX
+	JZ mulGFNI_5x4_64Xor_end
+	VBROADCASTF32X2 (CX), Z0
+	VBROADCASTF32X2 8(CX), Z1
+	VBROADCASTF32X2 16(CX), Z2
+	VBROADCASTF32X2 24(CX), Z3
+	VBROADCASTF32X2 32(CX), Z4
+	VBROADCASTF32X2 40(CX), Z5
+	VBROADCASTF32X2 48(CX), Z6
+	VBROADCASTF32X2 56(CX), Z7
+	VBROADCASTF32X2 64(CX), Z8
+	VBROADCASTF32X2 72(CX), Z9
+	VBROADCASTF32X2 80(CX), Z10
+	VBROADCASTF32X2 88(CX), Z11
+	VBROADCASTF32X2 96(CX), Z12
+	VBROADCASTF32X2 104(CX), Z13
+	VBROADCASTF32X2 112(CX), Z14
+	VBROADCASTF32X2 120(CX), Z15
+	VBROADCASTF32X2 128(CX), Z16
+	VBROADCASTF32X2 136(CX), Z17
+	VBROADCASTF32X2 144(CX), Z18
+	VBROADCASTF32X2 152(CX), Z19
+	MOVQ in_base+24(FP), CX
+	MOVQ (CX), DX
+	MOVQ 24(CX), BX
+	MOVQ 48(CX), SI
+	MOVQ 72(CX), DI
+	MOVQ 96(CX), CX
+	MOVQ out_base+48(FP), R8
+	MOVQ out_base+48(FP), R8
+	MOVQ (R8), R9
+	MOVQ 24(R8), R10
+	MOVQ 48(R8), R11
+	MOVQ 72(R8), R8
+	MOVQ start+72(FP), R12
+
+	// Add start offset to output
+	ADDQ R12, R9
+	ADDQ R12, R10
+	ADDQ R12, R11
+	ADDQ R12, R8
+
+	// Add start offset to input
+	ADDQ R12, DX
+	ADDQ R12, BX
+	ADDQ R12, SI
+	ADDQ R12, DI
+	ADDQ R12, CX
+
+mulGFNI_5x4_64Xor_loop:
+	// Load 4 outputs
+	VMOVDQU64 (R9), Z20
+	VMOVDQU64 (R10), Z21
+	VMOVDQU64 (R11), Z22
+	VMOVDQU64 (R8), Z23
+
+	// Load and process 64 bytes from input 0 to 4 outputs
+	VMOVDQU64 (DX), Z24
+	ADDQ $0x40, DX
+	VGF2P8AFFINEQB $0x00, Z0, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z1, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z2, Z24, Z25
+	VXORPD Z22, Z25, Z22
+	VGF2P8AFFINEQB $0x00, Z3, Z24, Z25
+	VXORPD Z23, Z25, Z23
+
+	// Load and process 64 bytes from input 1 to 4 outputs
+	VMOVDQU64 (BX), Z24
+	ADDQ $0x40, BX
+	VGF2P8AFFINEQB $0x00, Z4, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z5, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z6, Z24, Z25
+	VXORPD Z22, Z25, Z22
+	VGF2P8AFFINEQB $0x00, Z7, Z24, Z25
+	VXORPD Z23, Z25, Z23
+
+	// Load and process 64 bytes from input 2 to 4 outputs
+	VMOVDQU64 (SI), Z24
+	ADDQ $0x40, SI
+	VGF2P8AFFINEQB $0x00, Z8, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z9, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z10, Z24, Z25
+	VXORPD Z22, Z25, Z22
+	VGF2P8AFFINEQB $0x00, Z11, Z24, Z25
+	VXORPD Z23, Z25, Z23
+
+	// Load and process 64 bytes from input 3 to 4 outputs
+	VMOVDQU64 (DI), Z24
+	ADDQ $0x40, DI
+	VGF2P8AFFINEQB $0x00, Z12, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z13, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z14, Z24, Z25
+	VXORPD Z22, Z25, Z22
+	VGF2P8AFFINEQB $0x00, Z15, Z24, Z25
+	VXORPD Z23, Z25, Z23
+
+	// Load and process 64 bytes from input 4 to 4 outputs
+	VMOVDQU64 (CX), Z24
+	ADDQ $0x40, CX
+	VGF2P8AFFINEQB $0x00, Z16, Z24, Z25
+	VXORPD Z20, Z25, Z20
+	VGF2P8AFFINEQB $0x00, Z17, Z24, Z25
+	VXORPD Z21, Z25, Z21
+	VGF2P8AFFINEQB $0x00, Z18, Z24, Z25
+	VXORPD Z22, Z25, Z22
+ VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64Xor_loop + VZEROUPPER + +mulGFNI_5x4_64Xor_end: + RET + +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + +// func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + 
VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x4Xor_loop + VZEROUPPER + +mulAvxTwo_5x4Xor_end: + RET + +// func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 
to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x5_loop + VZEROUPPER + +mulAvxTwo_5x5_end: + RET + +// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + 
VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64_loop + VZEROUPPER + +mulGFNI_5x5_64_end: + RET + +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX 
+ MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + +// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 
24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare 
for next loop + DECQ AX + JNZ mulGFNI_5x5_64Xor_loop + VZEROUPPER + +mulGFNI_5x5_64Xor_end: + RET + +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + +// func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_5x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (R9), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, 
Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x5Xor_loop + VZEROUPPER + +mulAvxTwo_5x5Xor_end: + RET + +// func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 71 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, 
Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, 
Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x6_loop + VZEROUPPER + +mulAvxTwo_5x6_end: + RET + +// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from 
input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64_loop + VZEROUPPER + +mulGFNI_5x6_64_end: + RET + +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + +// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ 
R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64Xor_loop + VZEROUPPER + +mulGFNI_5x6_64Xor_end: + RET + +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + 
VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + +// func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 71 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (R9), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 
800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x6Xor_loop + VZEROUPPER + +mulAvxTwo_5x6Xor_end: + RET + +// func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + 
ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 
1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x7_loop + VZEROUPPER + +mulAvxTwo_5x7_end: + RET + +// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to 
input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64_loop + VZEROUPPER + +mulGFNI_5x7_64_end: + RET + +// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + 
VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + +// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64Xor_loop + VZEROUPPER + +mulGFNI_5x7_64Xor_end: + RET + +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), 
Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) 
+ ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + +// func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_5x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R15), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + 
VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x7Xor_loop + VZEROUPPER + +mulAvxTwo_5x7Xor_end: + RET + +// func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 93 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x8_end + MOVQ in_base+24(FP), AX + MOVQ (AX), 
DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, 
Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y11 + ADDQ $0x20, AX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_5x8_loop + VZEROUPPER + +mulAvxTwo_5x8_end: + RET + +// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ 
mulGFNI_5x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64_loop + VZEROUPPER + +mulGFNI_5x8_64_end: + RET + +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + +// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + 
VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64Xor_loop + VZEROUPPER + +mulGFNI_5x8_64Xor_end: + RET + +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + +// func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: 
AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 93 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x8Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_5x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND 
Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y11 + ADDQ $0x20, AX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + 
VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_5x8Xor_loop + VZEROUPPER + +mulAvxTwo_5x8Xor_end: + RET + +// func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 104 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI 
+ VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, 
Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x9_loop + VZEROUPPER + +mulAvxTwo_5x9_end: + RET + +// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64_loop + VZEROUPPER + +mulGFNI_5x9_64_end: + RET + +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 
+ VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 
336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + +// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64Xor_loop + VZEROUPPER + +mulGFNI_5x9_64Xor_end: + RET + +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 
16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + +// func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 104 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_5x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, 
Y11, Y2) + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), 
Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x9Xor_loop + VZEROUPPER + +mulAvxTwo_5x9Xor_end: + RET + +// func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // 
Full registers estimated 115 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 
1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + 
VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x10_loop + VZEROUPPER + +mulAvxTwo_5x10_end: + RET + +// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, 
Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64_loop + VZEROUPPER + +mulGFNI_5x10_64_end: + RET + +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ 
in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + +// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z20 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 144(R9), 
R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 216(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD 
Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64Xor_loop + VZEROUPPER + +mulGFNI_5x10_64Xor_end: + RET + +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + +// func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 115 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_5x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 
+ XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) 
+ VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x10Xor_loop + VZEROUPPER + +mulAvxTwo_5x10Xor_end: + RET + +// func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 
32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1_64_loop + VZEROUPPER + +mulAvxTwo_6x1_64_end: + RET + +// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ 
R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64_loop + VZEROUPPER + +mulGFNI_6x1_64_end: + RET + +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + +// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), 
Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64Xor_loop + VZEROUPPER + +mulGFNI_6x1_64Xor_end: + RET + +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + 
VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1Xor_loop + VZEROUPPER + +mulAvxGFNI_6x1Xor_end: + RET + +// func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), 
Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x1_64Xor_end: + RET + +// func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + 
VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2_64_loop + VZEROUPPER + +mulAvxTwo_6x2_64_end: + RET + +// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 
outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64_loop + VZEROUPPER + +mulGFNI_6x2_64_end: + RET + +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + +// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT 
·mulGFNI_6x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R10), Z12 + VMOVDQU64 (R9), Z13 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64Xor_loop + VZEROUPPER + +mulGFNI_6x2_64Xor_end: + RET + +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 
24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2Xor_loop + VZEROUPPER + +mulAvxGFNI_6x2Xor_end: + RET + +// func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU 
(SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x2_64Xor_end: + RET + +// func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x3_64(SB), $0-88 + // Loading no tables to registers + // 
Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, 
Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3_64_loop + VZEROUPPER + +mulAvxTwo_6x3_64_end: + RET + +// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ 
(R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64_loop + VZEROUPPER + +mulGFNI_6x3_64_end: + RET + +// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes 
from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + +// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (R9), Z20 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z2, 
Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64Xor_loop + VZEROUPPER + +mulGFNI_6x3_64Xor_end: + RET + +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + 
VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3Xor_loop + VZEROUPPER + +mulAvxGFNI_6x3Xor_end: + RET + +// func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + VMOVDQU (R10), Y4 + VMOVDQU 32(R10), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ 
$0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + 
VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x3_64Xor_end: + RET + +// func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, 
Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x4_loop + VZEROUPPER + +mulAvxTwo_6x4_end: + RET + +// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 
48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64_loop + VZEROUPPER + +mulGFNI_6x4_64_end: + RET + +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, 
R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + +// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 
104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R9), Z27 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64Xor_loop + VZEROUPPER + +mulGFNI_6x4_64Xor_end: + RET + +// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full 
registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ 
$0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + +// func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB 
Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x4Xor_loop + VZEROUPPER + +mulAvxTwo_6x4Xor_end: + RET + +// func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + 
VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + 
// Store 5 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x5_loop + VZEROUPPER + +mulAvxTwo_6x5_end: + RET + +// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from 
input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64_loop + VZEROUPPER + +mulGFNI_6x5_64_end: + RET + +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, 
Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + +// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + 
ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64Xor_loop + VZEROUPPER + +mulGFNI_6x5_64Xor_end: + RET + +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + +// func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_6x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 
800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x5Xor_loop + VZEROUPPER + +mulAvxTwo_6x5Xor_end: + RET + +// func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + 
MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, 
Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x6_loop + VZEROUPPER + +mulAvxTwo_6x6_end: + RET + +// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + 
MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64_loop + VZEROUPPER + +mulGFNI_6x6_64_end: + RET + 
+// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + +// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 
+ VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64Xor_loop + VZEROUPPER + +mulGFNI_6x6_64Xor_end: + RET + +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ 
in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + +// func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + 
VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + 
VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x6Xor_loop + VZEROUPPER + +mulAvxTwo_6x6Xor_end: + RET + +// func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 96 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x7_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) 
+ VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y10 + ADDQ $0x20, AX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + 
VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_6x7_loop + VZEROUPPER + +mulAvxTwo_6x7_end: + RET + +// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, 
Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64_loop + VZEROUPPER + +mulGFNI_6x7_64_end: + RET + +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 
72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + +// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64Xor_loop + VZEROUPPER + +mulGFNI_6x7_64Xor_end: + RET + +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // 
Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + +// func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 96 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x7Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_6x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + 
VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU (R15), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( 
$0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y10 + ADDQ $0x20, AX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_6x7Xor_loop + VZEROUPPER + +mulAvxTwo_6x7Xor_end: + RET + +// func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 109 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, 
Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 
1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + + // 
Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x8_loop + VZEROUPPER + +mulAvxTwo_6x8_end: + RET + +// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64_loop + VZEROUPPER + +mulGFNI_6x8_64_end: + RET + +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + +// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, 
Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64Xor_loop + VZEROUPPER + +mulGFNI_6x8_64Xor_end: + RET + +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 
48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + +// func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 109 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_6x8Xor_loop: + // Load and process 32 bytes from 
input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 
1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, 
Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x8Xor_loop + VZEROUPPER + +mulAvxTwo_6x8Xor_end: + RET + +// func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 122 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + 
VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, 
Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x9_loop + VZEROUPPER + +mulAvxTwo_6x9_end: + RET + +// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + 
VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64_loop + VZEROUPPER + +mulGFNI_6x9_64_end: + RET + +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + +// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 
Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64Xor_loop + VZEROUPPER + +mulGFNI_6x9_64Xor_end: + RET + +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), 
R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + +// func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 122 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_6x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, 
Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, 
Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x9Xor_loop + VZEROUPPER + +mulAvxTwo_6x9Xor_end: + RET + +// func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 135 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X10 + 
VPBROADCASTB X10, Y10 + +mulAvxTwo_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, 
Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( 
$0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x10_loop + VZEROUPPER + +mulAvxTwo_6x10_end: + RET + +// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, 
Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + 
VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64_loop + VZEROUPPER + +mulGFNI_6x10_64_end: + RET + +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + +// func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z20 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 216(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB 
$0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64Xor_loop + VZEROUPPER + +mulGFNI_6x10_64Xor_end: + RET + +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + +// func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 135 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_6x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y11 + 
VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, 
Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 
+ VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x10Xor_loop + VZEROUPPER + +mulAvxTwo_6x10Xor_end: + RET + +// func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + 
ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_64_loop + VZEROUPPER + +mulAvxTwo_7x1_64_end: + RET + +// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, 
Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64_loop + VZEROUPPER + +mulGFNI_7x1_64_end: + RET + +// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + +// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64Xor_end + VBROADCASTF32X2 
(CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R10), Z7 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64Xor_loop + VZEROUPPER + +mulGFNI_7x1_64Xor_end: + RET + +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB 
$0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1Xor_loop + VZEROUPPER + +mulAvxGFNI_7x1Xor_end: + RET + +// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, 
Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x1_64Xor_end: + RET + +// func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + 
ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_64_loop + VZEROUPPER + +mulAvxTwo_7x2_64_end: + RET + +// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full 
registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64_loop + VZEROUPPER + +mulGFNI_7x2_64_end: + RET + +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 
+ MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + +// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R11), Z14 + VMOVDQU64 (R10), Z15 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + 
ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64Xor_loop + VZEROUPPER + +mulGFNI_7x2_64Xor_end: + RET + +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2Xor_loop + VZEROUPPER + +mulAvxGFNI_7x2Xor_end: + RET + +// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + 
VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x2_64Xor_end: + RET + +// func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x3_64(SB), $0-88 + // Loading no tables to 
registers + // Destination kept in GP registers + // Full registers estimated 94 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB 
Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_64_loop + VZEROUPPER + +mulAvxTwo_7x3_64_end: + RET + +// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64(SB), $0-88 + // Loading all tables to registers + // Destination 
kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64_loop + VZEROUPPER + +mulGFNI_7x3_64_end: + RET + +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) 
+// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + +// func mulGFNI_7x3_64Xor(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (R10), Z23 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 
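+	// (annotation, not part of the generated file) Z0-Z20 hold the 21 broadcast
+	// 8x8 GF(2) bit-matrices (7 inputs x 3 outputs); each VGF2P8AFFINEQB above
+	// multiplies 64 input bytes by one matrix coefficient, and the following
+	// VXORPD folds that partial product into the accumulators Z21-Z23, which
+	// this Xor variant seeded from the existing output at the top of the loop.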
+ + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64Xor_loop + VZEROUPPER + +mulGFNI_7x3_64Xor_end: + RET + +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3Xor_loop + VZEROUPPER + +mulAvxGFNI_7x3Xor_end: + RET + +// func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 94 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + VMOVDQU (R11), Y4 + VMOVDQU 32(R11), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), 
Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 
1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x3_64Xor_end: + RET + +// func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + 
VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x4_loop + VZEROUPPER + +mulAvxTwo_7x4_end: + RET + +// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 
+ VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64_loop + VZEROUPPER + +mulGFNI_7x4_64_end: + RET + +// func mulAvxGFNI_7x4(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs 
+ VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + +// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, 
Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64Xor_loop + VZEROUPPER + +mulGFNI_7x4_64Xor_end: + RET + +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + +// func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + 
MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 
1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x4Xor_loop + VZEROUPPER + +mulAvxTwo_7x4Xor_end: + RET + +// func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( 
$0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + 
VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x5_loop + VZEROUPPER + +mulAvxTwo_7x5_end: + RET + +// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + 
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64_loop + VZEROUPPER + +mulGFNI_7x5_64_end: + RET + +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + +// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers 
+ // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 
+ VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64Xor_loop + VZEROUPPER + +mulGFNI_7x5_64Xor_end: + RET + +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + 
ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + +// func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 
24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_7x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ 
$0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x5Xor_loop + VZEROUPPER + +mulAvxTwo_7x5Xor_end: + RET + +// func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + 
+mulAvxTwo_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND 
Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y9 + ADDQ $0x20, AX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_7x6_loop + VZEROUPPER + +mulAvxTwo_7x6_end: + RET + +// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 
88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers 
estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, 
Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64Xor_loop + VZEROUPPER + +mulGFNI_7x6_64Xor_end: + RET + +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB 
$0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6Xor_loop + VZEROUPPER + +mulAvxGFNI_7x6Xor_end: + RET + +// func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_7x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, 
Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 
2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y9 + ADDQ $0x20, AX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_7x6Xor_loop + VZEROUPPER + +mulAvxTwo_7x6Xor_end: + RET + +// func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + 
XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 
7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x7_loop + VZEROUPPER + +mulAvxTwo_7x7_end: + RET + +// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 
24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 
outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64_loop + VZEROUPPER + +mulGFNI_7x7_64_end: + RET + +// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + +// func 
mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, 
Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64Xor_loop + VZEROUPPER + +mulGFNI_7x7_64Xor_end: + RET + +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI 
+ ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + +// func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_7x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 
160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB 
Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + + // 
Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x7Xor_loop + VZEROUPPER + +mulAvxTwo_7x7Xor_end: + RET + +// func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), 
Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( 
$0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x8_loop + VZEROUPPER + +mulAvxTwo_7x8_end: + RET + +// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ 
out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 
+ VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64_loop + VZEROUPPER + +mulGFNI_7x8_64_end: + RET + +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + +// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB 
$0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD 
Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64Xor_loop + VZEROUPPER + +mulGFNI_7x8_64Xor_end: + RET + +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + +// func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_7x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), 
Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), 
Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x8Xor_loop + VZEROUPPER + 
+mulAvxTwo_7x8Xor_end: + RET + +// func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 140 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + 
VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + 
VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x9_loop + VZEROUPPER + +mulAvxTwo_7x9_end: + RET + +// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64(SB), $0-88 + // 
Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST 
$0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64_loop + VZEROUPPER + +mulGFNI_7x9_64_end: + RET + +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 
72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 
outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + +// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX 
+ JZ mulGFNI_7x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 
to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64Xor_loop + VZEROUPPER + +mulGFNI_7x9_64Xor_end: + RET + +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start 
int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + +// func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 140 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_7x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + 
VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, 
Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB 
Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x9Xor_loop + VZEROUPPER + +mulAvxTwo_7x9Xor_end: + RET + +// func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 155 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + 
XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + 
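The mulAvxTwo_* kernels in this file vectorize the classic two-nibble table lookup for GF(2^8) multiplication: each input byte is split into its low nibble (VPAND with the broadcast 0x0f mask) and its high nibble (VPSRLQ 4 plus the same mask), each matrix coefficient contributes two 16-entry lookup tables, VPSHUFB performs 32 lookups per instruction, and XOR3WAY folds both lookup results into the running output register. A minimal scalar sketch of that per-byte step, with illustrative names that are not part of the vendored code:

    // mulAddNibbleTables mirrors one "Load and process ... from input i to
    // output o" block at byte granularity. low and high are the two 16-entry
    // tables derived from a single GF(2^8) matrix coefficient.
    func mulAddNibbleTables(low, high *[16]byte, in, out []byte) {
        for i, x := range in {
            out[i] ^= low[x&0x0f] ^ high[x>>4]
        }
    }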
VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + 
VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x10_loop + VZEROUPPER + +mulAvxTwo_7x10_end: + RET + +// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + 
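The mulGFNI_* and mulAvxGFNI_* kernels replace the nibble tables with VGF2P8AFFINEQB. Multiplication by a fixed GF(2^8) constant is linear over GF(2), so each coefficient can be pre-encoded as an 8x8 bit-matrix packed into one 64-bit word; VBROADCASTF32X2 (512-bit) and VBROADCASTSD (256-bit) broadcast that word to every lane, and once the preloaded registers run out ("Loading 20 of 70 tables to registers") the .BCST forms read the matrix straight from memory. A hedged per-byte sketch of the affine step; the row and bit ordering shown here is illustrative, not the exact ISA layout:

    // mulBitMatrix applies an 8x8 bit-matrix over GF(2) to one byte, which is
    // what VGF2P8AFFINEQB (with a zero immediate) does to every byte in a vector.
    func mulBitMatrix(rows [8]byte, x byte) byte {
        var y byte
        for i := 0; i < 8; i++ {
            v := rows[i] & x // GF(2) dot product of matrix row i with x ...
            v ^= v >> 4
            v ^= v >> 2
            v ^= v >> 1 // ... reduced to its parity bit
            y |= (v & 1) << uint(i)
        }
        return y
    }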
VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64_loop + VZEROUPPER + +mulGFNI_7x10_64_end: + RET + +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from 
input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + +// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + 
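The ...Xor kernels (such as mulGFNI_7x10_64Xor here) differ from the plain variants only in initialization: each loop pass first loads the ten existing output vectors from the destination shards and accumulates into them, whereas the plain kernels start each output from the first input's product. An illustrative reference loop for what both shapes compute, with mul standing in for either the nibble-table or the affine multiplication (these names are not the library's API):

    // galMulSlices: out[o] receives (plain) or is XORed with (...Xor) the GF(2^8)
    // dot product of coefficient column o with the input shards.
    func galMulSlices(coeffs [][]byte, in, out [][]byte, xorInto bool, mul func(c, x byte) byte) {
        for o := range out {
            for pos := range out[o] {
                var acc byte
                for i := range in {
                    acc ^= mul(coeffs[i][o], in[i][pos])
                }
                if xorInto {
                    out[o][pos] ^= acc
                } else {
                    out[o][pos] = acc
                }
            }
        }
    }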
VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z20 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 216(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ 
$0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 
+ VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64Xor_loop + VZEROUPPER + +mulGFNI_7x10_64Xor_end: + RET + +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + +// func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 155 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_7x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R11), R13 + VMOVDQU 
(R13)(R12*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), 
Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + 
VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x10Xor_loop + VZEROUPPER + +mulAvxTwo_7x10Xor_end: + RET + +// func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX 
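Two recurring prologue details, stated explicitly as a hedged sketch rather than anything taken from the library's documentation: the SHRQ converts the byte count n into an iteration count because every loop pass consumes a fixed stride per input shard, and the 24-byte offsets used when walking the in and out [][]byte arguments step through one slice header (pointer, len, cap) at a time.

    // Minimal sketch of the constants implied by the assembly above.
    const (
        stride32       = 0x20 // bytes per shard per iteration in 32-byte kernels (SHRQ $0x05, AX)
        stride64       = 0x40 // bytes per shard per iteration in the _64 kernels (SHRQ $0x06, AX)
        sliceHeaderLen = 24   // spacing of shard pointers: 24(DX), 48(DX), ... index a [][]byte header array
    )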
+ TESTQ AX, AX + JZ mulAvxTwo_8x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, 
Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_64_loop + VZEROUPPER + +mulAvxTwo_8x1_64_end: + RET + +// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + 
VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + 
VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64Xor_loop + VZEROUPPER + +mulGFNI_8x1_64Xor_end: + RET + +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1Xor_loop + VZEROUPPER + +mulAvxGFNI_8x1Xor_end: + RET + +// func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 
bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x1_64Xor_end: + RET + +// func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB 
Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU 
Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_64_loop + VZEROUPPER + +mulAvxTwo_8x2_64_end: + RET + +// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64_loop + VZEROUPPER + +mulGFNI_8x2_64_end: + RET + +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + RET + +// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64Xor_end + 
VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R12), Z16 + VMOVDQU64 (R11), Z17 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64Xor_loop + VZEROUPPER + +mulGFNI_8x2_64Xor_end: + RET + +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2Xor_loop + VZEROUPPER + +mulAvxGFNI_8x2Xor_end: + RET + +// func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add 
start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // 
Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x2_64Xor_end: + RET + +// func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x3_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 106 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, 
Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + 
VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_64_loop + VZEROUPPER + +mulAvxTwo_8x3_64_end: + RET + +// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 
40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64_loop + VZEROUPPER + +mulGFNI_8x3_64_end: + RET + 
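+// Note on the GFNI kernels above and below: each 8-byte value broadcast from
+// the matrix with VBROADCASTF32X2/VBROADCASTSD encodes one GF(2^8) coefficient
+// as an 8x8 bit matrix, so a single VGF2P8AFFINEQB multiplies every byte of the
+// input vector by that coefficient. Per-output results are accumulated with
+// VXORPD, and the *Xor variants first load the existing output so the products
+// are added into it. A rough scalar sketch of what one loop iteration computes
+// (galMul is a hypothetical GF(2^8) multiply helper; the matrix layout of one
+// coefficient per input/output pair follows the table loads above):
+//
+//	for i := range out[0] {
+//		for o := range out {
+//			var acc byte
+//			for j := range in {
+//				acc ^= galMul(byte(matrix[j*len(out)+o]), in[j][i])
+//			}
+//			out[o][i] = acc // Xor variants: out[o][i] ^= acc
+//		}
+//	}
+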
+// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + +// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (R11), Z26 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + 
VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64Xor_loop + VZEROUPPER + +mulGFNI_8x3_64Xor_end: + RET + +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3Xor_loop + VZEROUPPER + +mulAvxGFNI_8x3Xor_end: + RET + +// func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 106 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 + VMOVDQU (R12), Y4 + VMOVDQU 32(R12), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, 
Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( 
$0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x3_64Xor_end: + RET + +// func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ 
start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 
+ VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x4_loop + VZEROUPPER + +mulAvxTwo_8x4_end: + RET + +// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ 
BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64_loop + VZEROUPPER + +mulGFNI_8x4_64_end: + RET + +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, 
AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + +// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + 
VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64Xor_loop + VZEROUPPER + +mulGFNI_8x4_64Xor_end: + RET + +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + 
VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + 
ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + +// func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + 
VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x4Xor_loop + VZEROUPPER + +mulAvxTwo_8x4Xor_end: + RET + +// func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ 
out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, 
Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y8 + ADDQ $0x20, AX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_8x5_loop + VZEROUPPER + +mulAvxTwo_8x5_end: + RET + +// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64_end + 
VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, 
Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64_loop + VZEROUPPER + +mulGFNI_8x5_64_end: + RET + +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), 
Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + +// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, 
start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes 
from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64Xor_loop + VZEROUPPER + +mulGFNI_8x5_64Xor_end: + RET + +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 
outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX 
+ VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + +// func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_8x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, 
Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 
+ VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y8 + ADDQ $0x20, AX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_8x5Xor_loop + VZEROUPPER + +mulAvxTwo_8x5Xor_end: + RET + +// func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + 
VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, 
Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x6_loop + VZEROUPPER + +mulAvxTwo_8x6_end: + RET + +// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start 
offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 
336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64_loop + VZEROUPPER + +mulGFNI_8x6_64_end: + RET + +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + +// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 + // Loading 24 of 48 tables 
to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from 
input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64Xor_loop + VZEROUPPER + +mulGFNI_8x6_64Xor_end: + RET + +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU 
(R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + +// func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + 
VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, 
Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x6Xor_loop + VZEROUPPER + +mulAvxTwo_8x6Xor_end: + RET + +// func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 + // Loading no tables to 
registers + // Destination kept on stack + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // 
Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + 
VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x7_loop + VZEROUPPER + +mulAvxTwo_8x7_end: + RET + +// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB 
$0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64_loop + VZEROUPPER + +mulGFNI_8x7_64_end: + RET + +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + +// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, 
Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST 
$0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64Xor_loop + VZEROUPPER + +mulGFNI_8x7_64Xor_end: + RET + +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 
+ VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + +// func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_8x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), 
Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, 
R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x7Xor_loop + VZEROUPPER + +mulAvxTwo_8x7Xor_end: + RET + +// func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT 
·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) 
+ VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, 
Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x8_loop + VZEROUPPER + +mulAvxTwo_8x8_end: + RET + +// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + 
VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64_loop + VZEROUPPER + +mulGFNI_8x8_64_end: + RET + +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 
+ MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + +// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: 
AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD 
Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + 
VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64Xor_loop + VZEROUPPER + +mulGFNI_8x8_64Xor_end: + RET + +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + +// func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_8x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + MOVQ 168(R12), R14 + 
VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB 
Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from 
input 7 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x8Xor_loop + VZEROUPPER + +mulAvxTwo_8x8Xor_end: + RET + +// func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 158 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 
+ VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) 
+ VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, 
Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x9_loop + VZEROUPPER + +mulAvxTwo_8x9_end: + RET + +// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), 
Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD 
Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64_loop + VZEROUPPER + +mulGFNI_8x9_64_end: + RET + +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + +// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, 
Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 
448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64Xor_loop + VZEROUPPER + +mulGFNI_8x9_64Xor_end: + RET + +// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD 
Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ 
mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + +// func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 158 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_8x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, 
Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( 
$0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND 
Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x9Xor_loop + VZEROUPPER + +mulAvxTwo_8x9Xor_end: + RET + +// func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 175 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB 
Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) 
+ VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), 
Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + 
MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x10_loop + VZEROUPPER + +mulAvxTwo_8x10_end: + RET + +// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD 
Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs 
+ VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64_loop + VZEROUPPER + +mulGFNI_8x10_64_end: + RET + +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 
416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + +// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM 
used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z20 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 216(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 
200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes 
from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64Xor_loop + VZEROUPPER + +mulGFNI_8x10_64Xor_end: + RET + +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 
48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + +// func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 175 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_8x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, 
Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 
2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, 
Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x10Xor_loop + VZEROUPPER + +mulAvxTwo_8x10Xor_end: + RET + +// func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x1_64(SB), $0-88 + // Loading no 
tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, 
Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_64_loop + VZEROUPPER + +mulAvxTwo_9x1_64_end: + RET + +// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 
to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64_loop + VZEROUPPER + +mulGFNI_9x1_64_end: + RET + +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + +// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 
+ VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R12), Z9 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64Xor_loop + VZEROUPPER + +mulGFNI_9x1_64Xor_end: + RET + +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + 
+ // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1Xor_loop + VZEROUPPER + +mulAvxGFNI_9x1Xor_end: + RET + +// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 
+ VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x1_64Xor_end: + RET + +// func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x2_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB 
Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_64_loop + VZEROUPPER + +mulAvxTwo_9x2_64_end: + RET + +// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + 
VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 
48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + 
VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64Xor_loop + VZEROUPPER + +mulGFNI_9x2_64Xor_end: + RET + +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ 
$0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2Xor_loop + VZEROUPPER + +mulAvxGFNI_9x2Xor_end: + RET + +// func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 + // Loading no tables to registers 
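+ // Xor variant: the 2 outputs are loaded at the top of each iteration and the new
+ // products are accumulated into them. 9 input shards are combined into 2 output
+ // shards, 64 bytes per iteration, using the split 4-bit table method seen in the
+ // loop below (mask/shift each byte into nibbles, VPSHUFB table lookups, XOR3WAY
+ // into the loaded outputs).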
+ // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ 
$0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x2_64Xor_end: + RET + +// func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP 
registers + // Full registers estimated 118 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB 
Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB 
Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x3_64_loop + VZEROUPPER + +mulAvxTwo_9x3_64_end: + RET + +// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 
outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ 
BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64Xor_loop + VZEROUPPER + +mulGFNI_9x3_64Xor_end: + RET + +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3Xor_loop + VZEROUPPER + +mulAvxGFNI_9x3Xor_end: + RET + +// func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 118 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 
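+ // Tables are stored as one 64-byte (low/high nibble) pair per input/output
+ // combination; this second pair at 64(CX)/96(CX) is the one for output 1 (Y2/Y3).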
+ VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + 
VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 
32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x3_64Xor_end: + RET + +// func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x4_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + 
VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y7 + ADDQ $0x20, AX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + 
// Prepare for next loop + DECQ BP + JNZ mulAvxTwo_9x4_loop + VZEROUPPER + +mulAvxTwo_9x4_end: + RET + +// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64_loop + VZEROUPPER + +mulGFNI_9x4_64_end: + RET + +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, 
Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + +// func 
mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD 
Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64Xor_loop + VZEROUPPER + +mulGFNI_9x4_64Xor_end: + RET + +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + +// func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x4Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_9x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + 
VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y7 + ADDQ $0x20, AX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, 
Y6, Y3) + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_9x4Xor_loop + VZEROUPPER + +mulAvxTwo_9x4Xor_end: + RET + +// func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 100 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + 
VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 
8 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x5_loop + VZEROUPPER + +mulAvxTwo_9x5_end: + RET + +// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, 
Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64_loop + VZEROUPPER + +mulGFNI_9x5_64_end: + RET + +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + +// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + 
ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64Xor_loop + VZEROUPPER + +mulGFNI_9x5_64Xor_end: + RET + +// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + 
MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + +// func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 100 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_9x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, 
Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + 
VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x5Xor_loop + VZEROUPPER + +mulAvxTwo_9x5Xor_end: + RET + +// func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 119 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + 
VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + 
VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x6_loop + VZEROUPPER + +mulAvxTwo_9x6_end: + RET + +// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 
62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 
bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64_loop + VZEROUPPER + +mulGFNI_9x6_64_end: + RET + +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and 
process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + +// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ 
R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST 
$0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64Xor_loop + VZEROUPPER + +mulGFNI_9x6_64Xor_end: + RET + +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + +// func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 119 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + 
VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 
1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 
+ XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x6Xor_loop + VZEROUPPER + +mulAvxTwo_9x6Xor_end: + RET + +// func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 138 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU 
(DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + 
XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, 
(R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x7_loop + VZEROUPPER + +mulAvxTwo_9x7_end: + RET + +// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, 
Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + 
MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64_loop + VZEROUPPER + +mulGFNI_9x7_64_end: + RET + +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7_loop + VZEROUPPER + +mulAvxGFNI_9x7_end: + RET + +// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + 
VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 
+ ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64Xor_loop + VZEROUPPER + +mulGFNI_9x7_64Xor_end: + RET + +// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7Xor_loop + VZEROUPPER + +mulAvxGFNI_9x7Xor_end: + RET + +// func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 138 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_9x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + MOVQ 72(R13), 
R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 
1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + 
VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x7Xor_loop + VZEROUPPER + +mulAvxTwo_9x7Xor_end: + RET + +// func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 157 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + 
VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB 
Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 
3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x8_loop + VZEROUPPER + +mulAvxTwo_9x8_end: + RET + +// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + 
VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), 
Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64_loop + VZEROUPPER + 
+mulGFNI_9x8_64_end: + RET + +// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 
8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8_loop + VZEROUPPER + +mulAvxGFNI_9x8_end: + RET + +// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 + // Loading 22 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x8_64Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, 
Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), 
Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x8_64Xor_loop + VZEROUPPER + +mulGFNI_9x8_64Xor_end: + RET + +// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + 
MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8Xor_loop + VZEROUPPER + +mulAvxGFNI_9x8Xor_end: + RET + +// func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 157 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_9x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 
672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 
2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 
+ XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x8Xor_loop + VZEROUPPER + +mulAvxTwo_9x8Xor_end: + RET + +// func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 176 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, 
Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 
1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB 
Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 
+ VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x9_loop + VZEROUPPER + +mulAvxTwo_9x9_end: + RET + +// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB 
$0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + 
VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64_loop + VZEROUPPER + +mulGFNI_9x9_64_end: + RET + +// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + 
VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9_loop + VZEROUPPER + +mulAvxGFNI_9x9_end: + RET + +// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 + // Loading 21 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, 
Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + 
VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64Xor_loop + VZEROUPPER + +mulGFNI_9x9_64Xor_end: + RET + +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + 
ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + +// func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 176 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_9x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + 
MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU 
(R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB 
Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 
+ VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x9Xor_loop + VZEROUPPER + +mulAvxTwo_9x9Xor_end: + RET + +// func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 195 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 
outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 
+ VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, 
Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 
+ VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x10_loop + VZEROUPPER + +mulAvxTwo_9x10_end: + RET + +// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + 
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64_loop + VZEROUPPER + +mulGFNI_9x10_64_end: + RET + +// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + +// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 
144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z20 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 216(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, 
Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, 
Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64Xor_loop + VZEROUPPER + +mulGFNI_9x10_64Xor_end: + RET + +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + 
VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + +// func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 195 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_9x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + 
VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( 
$0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, 
Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + 
XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x10Xor_loop + VZEROUPPER + +mulAvxTwo_9x10Xor_end: + RET + +// func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x1_64(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_10x1_64_loop: + // Load and process 64 bytes 
from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, 
Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x1_64_loop + VZEROUPPER + +mulAvxTwo_10x1_64_end: + RET + +// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 
1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64_loop + VZEROUPPER + +mulGFNI_10x1_64_end: + RET + +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + +// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + 
TESTQ AX, AX + JZ mulGFNI_10x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R13), Z10 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64Xor_loop + VZEROUPPER + +mulGFNI_10x1_64Xor_end: + RET + +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 
216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1Xor_loop + VZEROUPPER + +mulAvxGFNI_10x1Xor_end: + RET + +// func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + 
VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, 
Y6, Y1) + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) + + // Store 1 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_10x1_64Xor_end: + RET + +// func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x2_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x2_64_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 
64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + 
XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x2_64_loop + VZEROUPPER + +mulAvxTwo_10x2_64_end: + RET + +// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 
bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64_loop + VZEROUPPER + +mulGFNI_10x2_64_end: + RET + +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + +// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R14), Z20 + VMOVDQU64 (R13), Z21 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 
+ ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64Xor_loop + VZEROUPPER + +mulGFNI_10x2_64Xor_end: + RET + +// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, 
R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2Xor_loop + VZEROUPPER + +mulAvxGFNI_10x2Xor_end: + RET + +// func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, 
R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y0 + VMOVDQU 32(R15), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes 
from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) + + // Store 2 outputs + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_10x2_64Xor_end: + RET + +// func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT 
·mulAvxTwo_10x3_64(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x3_64_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulAvxTwo_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + 
ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + 
VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y13 + ADDQ $0x40, AX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_10x3_64_loop + VZEROUPPER + +mulAvxTwo_10x3_64_end: + RET + +// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), 
Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 
224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes 
from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + 
SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64Xor_loop + VZEROUPPER + +mulGFNI_10x3_64Xor_end: + RET + +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + 
SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3Xor_loop + VZEROUPPER + +mulAvxGFNI_10x3Xor_end: + RET + +// func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x3_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulAvxTwo_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, 
Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 
1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y13 + ADDQ $0x40, AX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) 
+ VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_10x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_10x3_64Xor_end: + RET + +// func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x4_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + 
VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y7 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y7 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y7 + 
ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x4_loop + VZEROUPPER + +mulAvxTwo_10x4_end: + RET + +// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + 
ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64_loop + VZEROUPPER + +mulGFNI_10x4_64_end: + RET + +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 
144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + +// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 
64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64Xor_loop + VZEROUPPER + +mulGFNI_10x4_64Xor_end: + RET + +// func 
mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + +// func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + 
VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1472(CX), Y5 + 
VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y7 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y7 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y0) + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y1) + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x4Xor_loop + VZEROUPPER + +mulAvxTwo_10x4Xor_end: + RET + +// func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x5_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ 
out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + 
VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y8 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + 
VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x5_loop + VZEROUPPER + +mulAvxTwo_10x5_end: + RET + +// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, 
Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64_loop + VZEROUPPER + 
+mulGFNI_10x5_64_end: + RET + +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + 
VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + +// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + 
VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 
208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64Xor_loop + VZEROUPPER + +mulGFNI_10x5_64Xor_end: + RET + +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + 
+mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + +// func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), 
Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load 
and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y8 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y0) + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y1) + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y2) + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, 
Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y3) + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x5Xor_loop + VZEROUPPER + +mulAvxTwo_10x5Xor_end: + RET + +// func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x6_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 
928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + 
XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y9 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x6_loop + VZEROUPPER + +mulAvxTwo_10x6_end: + RET + +// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, 
GFNI +TEXT ·mulGFNI_10x6_64(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD 
Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64_loop + VZEROUPPER + +mulGFNI_10x6_64_end: + RET + +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU 
(R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + +// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, 
out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB 
$0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 
120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64Xor_loop + VZEROUPPER + +mulGFNI_10x6_64Xor_end: + RET + +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, 
Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + +// func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_10x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + 
VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2176(CX), Y7 
+ VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y9 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y0) + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y1) + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y2) + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB 
Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x6Xor_loop + VZEROUPPER + +mulAvxTwo_10x6Xor_end: + RET + +// func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x7_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND 
Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + 
VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y10 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 + VPSHUFB Y10, Y8, 
Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x7_loop + VZEROUPPER + +mulAvxTwo_10x7_end: + RET + +// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64_loop + VZEROUPPER + +mulGFNI_10x7_64_end: + RET + +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + +// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + 
VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 
224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + 
VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64Xor_loop + VZEROUPPER + +mulGFNI_10x7_64Xor_end: + RET + +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + +mulAvxGFNI_10x7Xor_end: + RET + +// func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 
160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, 
Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB 
Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y10 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y0) + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y1) + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y2) + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y3) + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y4) + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y5) + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x7Xor_loop + VZEROUPPER + +mulAvxTwo_10x7Xor_end: + RET + +// func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x8_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + 
XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB 
Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y11 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 
4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x8_loop + VZEROUPPER + +mulAvxTwo_10x8_end: + RET + +// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB 
$0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + 
VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64_loop + VZEROUPPER + +mulGFNI_10x8_64_end: + RET + +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + 
MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + +// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX 
+ VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + 
VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64Xor_loop + VZEROUPPER + +mulGFNI_10x8_64Xor_end: + RET + +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), 
AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 
+ VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + +// func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_10x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y9 + 
VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, 
Y10, Y7) + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), 
Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y11 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, 
Y12 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y0) + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y1) + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y2) + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y3) + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y6) + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x8Xor_loop + VZEROUPPER + +mulAvxTwo_10x8Xor_end: + RET + +// func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x9_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y0 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y1 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y2 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y3 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y4 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y5 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y6 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y7 + VMOVDQU 512(CX), Y10 + VMOVDQU 
544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + 
VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 
3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y12 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 9 to 9 outputs 
+ VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 5184(CX), Y10 + VMOVDQU 5216(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 5248(CX), Y10 + VMOVDQU 5280(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 5312(CX), Y10 + VMOVDQU 5344(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 5376(CX), Y10 + VMOVDQU 5408(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 5440(CX), Y10 + VMOVDQU 5472(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 5504(CX), Y10 + VMOVDQU 5536(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 5568(CX), Y10 + VMOVDQU 5600(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5632(CX), Y10 + VMOVDQU 5664(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5696(CX), Y10 + VMOVDQU 5728(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x9_loop + VZEROUPPER + +mulAvxTwo_10x9_end: + RET + +// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB 
$0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST 
$0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, 
(BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64_loop + VZEROUPPER + +mulGFNI_10x9_64_end: + RET + +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 
216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 
504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + +// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 
8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 
bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64Xor_loop + VZEROUPPER + +mulGFNI_10x9_64Xor_end: + RET + +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU 
(BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + +// func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_10x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y10 + 
VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 1536(CX), 
Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 
3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y12 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, 
Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 5184(CX), Y10 + VMOVDQU 5216(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y0) + VMOVDQU 5248(CX), Y10 + VMOVDQU 5280(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y1) + VMOVDQU 5312(CX), Y10 + VMOVDQU 5344(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y2) + VMOVDQU 5376(CX), Y10 + VMOVDQU 5408(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y3) + VMOVDQU 5440(CX), Y10 + VMOVDQU 5472(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y4) + VMOVDQU 5504(CX), Y10 + VMOVDQU 5536(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y5) + VMOVDQU 5568(CX), Y10 + VMOVDQU 5600(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y6) + VMOVDQU 5632(CX), Y10 + VMOVDQU 5664(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y7) + VMOVDQU 5696(CX), Y10 + VMOVDQU 5728(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x9Xor_loop + VZEROUPPER + +mulAvxTwo_10x9Xor_end: + RET + +// func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x10_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 
120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + 
XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, 
Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + 
XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y13 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5760(CX), Y11 + VMOVDQU 5792(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5824(CX), Y11 + VMOVDQU 5856(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5888(CX), Y11 + VMOVDQU 5920(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5952(CX), Y11 + VMOVDQU 5984(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 6016(CX), Y11 + VMOVDQU 6048(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 6080(CX), Y11 + VMOVDQU 6112(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, 
Y12, Y5) + VMOVDQU 6144(CX), Y11 + VMOVDQU 6176(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 6208(CX), Y11 + VMOVDQU 6240(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 6272(CX), Y11 + VMOVDQU 6304(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 6336(CX), Y11 + VMOVDQU 6368(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y9, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x10_loop + VZEROUPPER + +mulAvxTwo_10x10_end: + RET + +// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + 
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, 
(BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64_loop + VZEROUPPER + +mulGFNI_10x10_64_end: + RET + +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + 
VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + +// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z20 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 216(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + 
VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 
144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64Xor_loop + VZEROUPPER + +mulGFNI_10x10_64Xor_end: + RET + +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + +// func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_10x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + MOVQ 144(R14), BP + VMOVDQU 
(BP)(R15*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and 
process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, 
Y3) + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 
5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y13 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5760(CX), Y11 + VMOVDQU 5792(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y0) + VMOVDQU 5824(CX), Y11 + VMOVDQU 5856(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y1) + VMOVDQU 5888(CX), Y11 + VMOVDQU 5920(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y2) + VMOVDQU 5952(CX), Y11 + VMOVDQU 5984(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + VMOVDQU 6016(CX), Y11 + VMOVDQU 6048(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VMOVDQU 6080(CX), Y11 + VMOVDQU 6112(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + VMOVDQU 6144(CX), Y11 + VMOVDQU 6176(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU 6208(CX), Y11 + VMOVDQU 6240(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + VMOVDQU 6272(CX), Y11 + VMOVDQU 6304(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU 6336(CX), Y11 + VMOVDQU 6368(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y9, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ 
mulAvxTwo_10x10Xor_loop + VZEROUPPER + +mulAvxTwo_10x10Xor_end: + RET + +// func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT2_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPXOR Y11, Y9, Y11 + VPXOR Y12, Y10, Y12 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + VPSRLQ $0x04, Y11, Y13 + VPAND Y8, Y11, Y11 + VPAND Y8, Y13, Y13 + VPSHUFB Y11, Y0, Y14 + VPSHUFB Y11, Y1, Y11 + VPSHUFB Y13, Y2, Y15 + VPSHUFB Y13, Y3, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPAND Y12, Y8, Y13 + VPSRLQ $0x04, Y12, Y12 + VPAND Y8, Y12, Y12 + VPSHUFB Y13, Y4, Y15 + VPSHUFB Y13, Y5, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPSHUFB Y12, Y6, Y15 + VPSHUFB Y12, Y7, Y13 + XOR3WAY( $0x00, Y14, Y15, Y9) + XOR3WAY( $0x00, Y11, Y13, Y10) + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT2_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPSRLQ $0x04, Y11, Y13 + VPAND Y8, Y11, Y11 + VPAND Y8, Y13, Y13 + VPSHUFB Y11, Y0, Y14 + VPSHUFB Y11, Y1, Y11 + VPSHUFB Y13, Y2, Y15 + VPSHUFB Y13, Y3, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPAND Y12, Y8, Y13 + VPSRLQ $0x04, Y12, Y12 + VPAND Y8, Y12, Y12 + VPSHUFB Y13, Y4, Y15 + VPSHUFB Y13, Y5, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPSHUFB Y12, Y6, Y15 + VPSHUFB Y12, Y7, Y13 + XOR3WAY( $0x00, Y14, Y15, Y9) + XOR3WAY( $0x00, Y11, Y13, Y10) + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPXOR Y11, Y9, Y11 + VPXOR Y12, Y10, Y12 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulgf16_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y10 + VPSRLQ $0x04, Y9, Y11 + VPAND Y8, Y9, Y9 + VPAND Y8, Y11, Y11 + VPSHUFB Y9, Y0, Y12 + VPSHUFB Y9, Y1, Y9 + VPSHUFB Y11, Y2, Y13 + VPSHUFB Y11, Y3, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VPAND Y10, Y8, Y11 + VPSRLQ $0x04, Y10, Y10 + VPAND Y8, Y10, Y10 + VPSHUFB Y11, 
Y4, Y13 + VPSHUFB Y11, Y5, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VPSHUFB Y10, Y6, Y13 + VPSHUFB Y10, Y7, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VMOVDQU Y12, (CX) + VMOVDQU Y9, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_0(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (DX), Y1 + VBROADCASTI128 64(DX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(DX), Y1 + VBROADCASTI128 80(DX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(DX), Y1 + VBROADCASTI128 96(DX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(DX), Y1 + VBROADCASTI128 112(DX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ AX, SI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (DX)(SI*1), R9 + ADDQ AX, SI + MOVQ (DX)(SI*1), AX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VPSHUFB Y5, Y24, Y7 + VPSHUFB Y5, Y25, Y5 + VPSHUFB Y6, Y26, Y8 + VPSHUFB Y6, Y27, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VPSHUFB Y6, Y28, Y9 + VPSHUFB Y6, Y29, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VPSHUFB Y8, Y30, Y9 + VPSHUFB Y8, Y31, Y6 + VPTERNLOGD $0x96, Y7, Y9, Y1 + VPTERNLOGD $0x96, Y5, Y6, Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y5 + VPTERNLOGD $0x96, Y9, Y10, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, 
Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_0(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (DX), Y1 + VBROADCASTI128 64(DX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(DX), Y1 + VBROADCASTI128 80(DX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(DX), Y1 + VBROADCASTI128 96(DX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(DX), Y1 + VBROADCASTI128 112(DX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ AX, SI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (DX)(SI*1), R9 + ADDQ AX, SI + MOVQ (DX)(SI*1), AX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y24, Y11 + VPSHUFB Y9, Y25, Y9 + VPSHUFB Y10, Y26, Y12 + VPSHUFB Y10, Y27, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, 
Y28, Y13 + VPSHUFB Y10, Y29, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y30, Y13 + VPSHUFB Y12, Y31, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 64(CX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(CX), Y4 + VBROADCASTI128 80(CX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(CX), Y9 + VBROADCASTI128 96(CX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(CX), Y9 + VBROADCASTI128 112(CX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + VPTERNLOGD $0x96, Y3, Y9, Y5 + VPTERNLOGD $0x96, Y1, Y2, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_1(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y24, Y11 + VPSHUFB Y9, Y25, Y9 + VPSHUFB Y10, Y26, Y12 + VPSHUFB Y10, Y27, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y28, Y13 + VPSHUFB Y10, Y29, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y30, Y13 + VPSHUFB Y12, Y31, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y5 + VPTERNLOGD $0x96, Y9, Y10, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + 
VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_1(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VPSHUFB Y1, Y24, Y3 + VPSHUFB Y1, Y25, Y1 + VPSHUFB Y2, Y26, Y4 + VPSHUFB Y2, Y27, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VPSHUFB Y2, Y28, Y9 + VPSHUFB Y2, Y29, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VPSHUFB Y4, Y30, Y9 + 
VPSHUFB Y4, Y31, Y2 + VPTERNLOGD $0x96, Y3, Y9, Y5 + VPTERNLOGD $0x96, Y1, Y2, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_2(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VPSHUFB Y5, Y24, Y7 + VPSHUFB Y5, Y25, Y5 + VPSHUFB Y6, Y26, Y8 + VPSHUFB Y6, Y27, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VPSHUFB Y6, Y28, Y9 + VPSHUFB Y6, Y29, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VPSHUFB Y8, Y30, Y9 + VPSHUFB Y8, Y31, Y6 + VPTERNLOGD $0x96, Y7, Y9, Y1 + VPTERNLOGD $0x96, Y5, Y6, Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + 
SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_2(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VPSHUFB Y1, Y24, Y3 + VPSHUFB Y1, Y25, Y1 + VPSHUFB Y2, Y26, Y4 + VPSHUFB Y2, Y27, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VPSHUFB Y2, Y28, Y9 + VPSHUFB Y2, Y29, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VPSHUFB Y4, Y30, Y9 + VPSHUFB Y4, Y31, Y2 + VPTERNLOGD $0x96, Y3, Y9, Y5 + VPTERNLOGD $0x96, Y1, Y2, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT 
·ifftDIT4_avx512_3(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_3(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + 
VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VPSHUFB Y1, Y16, Y3 + VPSHUFB Y1, Y17, Y1 + VPSHUFB Y2, Y18, Y4 + VPSHUFB Y2, Y19, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VPSHUFB Y2, Y20, Y9 + VPSHUFB Y2, Y21, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VPSHUFB Y4, Y22, Y9 + VPSHUFB Y4, Y23, Y2 + VPTERNLOGD $0x96, Y3, Y9, Y5 + VPTERNLOGD $0x96, Y1, Y2, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_4(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VPSHUFB Y5, Y16, Y7 + VPSHUFB Y5, Y17, Y5 + VPSHUFB Y6, Y18, Y8 + VPSHUFB Y6, Y19, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VPSHUFB Y6, Y20, Y9 + VPSHUFB Y6, Y21, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VPSHUFB Y8, Y22, Y9 + VPSHUFB Y8, Y23, Y6 + VPTERNLOGD $0x96, Y7, Y9, Y1 + VPTERNLOGD $0x96, Y5, Y6, Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y24, Y11 + VPSHUFB Y9, Y25, Y9 + VPSHUFB Y10, Y26, Y12 + VPSHUFB Y10, Y27, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y28, Y13 + VPSHUFB Y10, Y29, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y30, Y13 + VPSHUFB Y12, Y31, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y5 + VPTERNLOGD $0x96, Y9, Y10, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + 
VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_4(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 64(CX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(CX), Y1 + VBROADCASTI128 80(CX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(CX), Y1 + VBROADCASTI128 96(CX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(CX), Y1 + VBROADCASTI128 112(CX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y24, Y11 + VPSHUFB Y9, Y25, Y9 + VPSHUFB Y10, Y26, Y12 + VPSHUFB Y10, Y27, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y28, Y13 + VPSHUFB Y10, Y29, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y30, Y13 + VPSHUFB Y12, Y31, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 
*[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_5(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y5 + VPTERNLOGD $0x96, Y9, Y10, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_5(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + 
VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512_6(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (DI), Y3 + VMOVDQU 32(DI), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VPSHUFB Y5, Y16, Y7 + VPSHUFB Y5, Y17, Y5 + VPSHUFB Y6, Y18, Y8 + VPSHUFB Y6, Y19, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VPSHUFB Y6, Y20, Y9 + VPSHUFB Y6, Y21, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VPSHUFB Y8, Y22, Y9 + VPSHUFB Y8, Y23, Y6 + VPTERNLOGD $0x96, Y7, Y9, Y1 + VPTERNLOGD $0x96, Y5, Y6, Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512_6(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (DI), Y3 + VMOVDQU 
32(DI), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y3, (DI) + VMOVDQU Y4, 32(DI) + ADDQ $0x40, DI + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT4_avx512_7(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VPXOR Y0, Y2, Y2 + VPXOR Y1, Y3, Y3 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·fftDIT4_avx512_7(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y0, Y2, Y2 + VPXOR Y1, Y3, Y3 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU 
Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_0(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), BX + MOVQ work_base+0(FP), SI + MOVQ 8(SI), DI + XORQ R8, R8 + MOVQ (SI)(R8*1), R9 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R10 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R11 + ADDQ BX, R8 + MOVQ (SI)(R8*1), BX + +loop: + VMOVDQU (R9), Y1 + VMOVDQU 32(R9), Y2 + VMOVDQU (R10), Y3 + VMOVDQU 32(R10), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VBROADCASTI128 (AX), Y7 + VBROADCASTI128 64(AX), Y8 + VPSHUFB Y5, Y7, Y7 + VPSHUFB Y5, Y8, Y5 + VBROADCASTI128 16(AX), Y8 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y6, Y8, Y8 + VPSHUFB Y6, Y9, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y6, Y9, Y9 + VPSHUFB Y6, Y10, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y6 + VPSHUFB Y8, Y9, Y9 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y7, Y9, Y1) + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R11), Y5 + VMOVDQU 32(R11), Y6 + VMOVDQU (BX), Y7 + VMOVDQU 32(BX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y5) + XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, 
Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU Y1, (R9) + VMOVDQU Y2, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y3, (R10) + VMOVDQU Y4, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y5, (R11) + VMOVDQU Y6, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y7, (BX) + VMOVDQU Y8, 32(BX) + ADDQ $0x40, BX + SUBQ $0x40, DI + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_0(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), BX + MOVQ work_base+0(FP), SI + MOVQ 8(SI), DI + XORQ R8, R8 + MOVQ (SI)(R8*1), R9 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R10 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R11 + ADDQ BX, R8 + MOVQ (SI)(R8*1), BX + +loop: + VMOVDQU (R9), Y1 + VMOVDQU 32(R9), Y2 + VMOVDQU (R11), Y5 + VMOVDQU 32(R11), Y6 + VMOVDQU (R10), Y3 + VMOVDQU 32(R10), Y4 + VMOVDQU (BX), Y7 + VMOVDQU 32(BX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( 
$0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (R9) + VMOVDQU Y2, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y3, (R10) + VMOVDQU Y4, 32(R10) + ADDQ $0x40, R10 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 64(CX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(CX), Y4 + VBROADCASTI128 80(CX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(CX), Y9 + VBROADCASTI128 96(CX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(CX), Y9 + VBROADCASTI128 112(CX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + XOR3WAY( $0x00, Y3, Y9, Y5) + XOR3WAY( $0x00, Y1, Y2, Y6) + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R11) + VMOVDQU Y6, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y7, (BX) + VMOVDQU Y8, 32(BX) + ADDQ $0x40, BX + SUBQ $0x40, DI + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_1(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y5) + XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 
+ VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_1(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 64(CX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(CX), Y4 + VBROADCASTI128 80(CX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(CX), Y9 + VBROADCASTI128 96(CX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(CX), Y9 + VBROADCASTI128 112(CX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + XOR3WAY( $0x00, Y3, Y9, Y5) + XOR3WAY( $0x00, Y1, Y2, Y6) + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT 
·ifftDIT4_avx2_2(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VBROADCASTI128 (AX), Y7 + VBROADCASTI128 64(AX), Y8 + VPSHUFB Y5, Y7, Y7 + VPSHUFB Y5, Y8, Y5 + VBROADCASTI128 16(AX), Y8 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y6, Y8, Y8 + VPSHUFB Y6, Y9, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y6, Y9, Y9 + VPSHUFB Y6, Y10, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y6 + VPSHUFB Y8, Y9, Y9 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y7, Y9, Y1) + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_2(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI 
+ XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (AX), Y3 + VBROADCASTI128 64(AX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(AX), Y4 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + XOR3WAY( $0x00, Y3, Y9, Y5) + XOR3WAY( $0x00, Y1, Y2, Y6) + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_3(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R9), Y5 + 
VMOVDQU 32(R9), Y6 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_3(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (AX), Y3 + VBROADCASTI128 64(AX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(AX), Y4 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + XOR3WAY( $0x00, Y3, Y9, Y5) + XOR3WAY( $0x00, Y1, Y2, Y6) + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, 
(R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_4(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VBROADCASTI128 (AX), Y7 + VBROADCASTI128 64(AX), Y8 + VPSHUFB Y5, Y7, Y7 + VPSHUFB Y5, Y8, Y5 + VBROADCASTI128 16(AX), Y8 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y6, Y8, Y8 + VPSHUFB Y6, Y9, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y6, Y9, Y9 + VPSHUFB Y6, Y10, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y6 + VPSHUFB Y8, Y9, Y9 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y7, Y9, Y1) + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y5) + XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_4(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, DX + MOVQ DX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + +loop: + VMOVDQU (R8), Y1 + VMOVDQU 32(R8), Y2 + VMOVDQU (R10), Y5 + VMOVDQU 32(R10), Y6 + VMOVDQU (R9), Y3 + VMOVDQU 32(R9), Y4 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, 
Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (R8) + VMOVDQU Y2, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y3, (R9) + VMOVDQU Y4, 32(R9) + ADDQ $0x40, R9 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R10) + VMOVDQU Y6, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y7, (DX) + VMOVDQU Y8, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_5(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, 
Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y5) + XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_5(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2_6(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), CX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VBROADCASTI128 (AX), Y7 + 
VBROADCASTI128 64(AX), Y8 + VPSHUFB Y5, Y7, Y7 + VPSHUFB Y5, Y8, Y5 + VBROADCASTI128 16(AX), Y8 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y6, Y8, Y8 + VPSHUFB Y6, Y9, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y6, Y9, Y9 + VPSHUFB Y6, Y10, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y6 + VPSHUFB Y8, Y9, Y9 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y7, Y9, Y1) + XOR3WAY( $0x00, Y5, Y6, Y2) + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2_6(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (CX) + VMOVDQU 
Y8, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT4_avx2_7(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VPXOR Y0, Y2, Y2 + VPXOR Y1, Y3, Y3 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·fftDIT4_avx2_7(SB), NOSPLIT, $0-56 + // dist must be multiplied by 24 (size of slice header) + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), AX + MOVQ table02+48(FP), AX + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y0, Y2, Y2 + VPXOR Y1, Y3, Y3 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·ifftDIT2_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 96(AX), X5 + XORPS X6, X6 + MOVQ $0x0000000f, CX + MOVQ CX, X7 + PSHUFB X6, X7 + MOVQ x_len+8(FP), CX + MOVQ x_base+0(FP), DX + MOVQ y_base+24(FP), BX + +loop: + MOVUPS (DX), X6 + MOVUPS 32(DX), X8 + MOVUPS (BX), X9 + MOVUPS 32(BX), X10 + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, (BX) + MOVUPS X10, 32(BX) + MOVAPS X9, X11 + PSRLQ $0x04, X11 + MOVAPS X9, X9 + PAND X7, X9 + PAND X7, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X9, X12 + PSHUFB X9, X13 + MOVUPS X2, X9 + MOVUPS X3, X14 + PSHUFB X11, X9 + PSHUFB X11, X14 + PXOR X9, X12 + PXOR X14, X13 + MOVAPS X10, X9 + MOVAPS X10, X10 + PAND X7, X9 + PSRLQ $0x04, X10 + PAND X7, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X9, X11 + PSHUFB X9, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS 48(AX), X11 + MOVUPS 112(AX), X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + 
PXOR X14, X13 + PXOR X12, X6 + PXOR X13, X8 + MOVUPS X6, (DX) + MOVUPS X8, 32(DX) + MOVUPS 16(DX), X6 + MOVUPS 48(DX), X8 + MOVUPS 16(BX), X9 + MOVUPS 48(BX), X10 + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, 16(BX) + MOVUPS X10, 48(BX) + MOVAPS X9, X11 + PSRLQ $0x04, X11 + MOVAPS X9, X9 + PAND X7, X9 + PAND X7, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X9, X12 + PSHUFB X9, X13 + MOVUPS X2, X9 + MOVUPS X3, X14 + PSHUFB X11, X9 + PSHUFB X11, X14 + PXOR X9, X12 + PXOR X14, X13 + MOVAPS X10, X9 + MOVAPS X10, X10 + PAND X7, X9 + PSRLQ $0x04, X10 + PAND X7, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X9, X11 + PSHUFB X9, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS 48(AX), X11 + MOVUPS 112(AX), X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + PXOR X12, X6 + PXOR X13, X8 + MOVUPS X6, 16(DX) + MOVUPS X8, 48(DX) + ADDQ $0x40, DX + ADDQ $0x40, BX + SUBQ $0x40, CX + JNZ loop + RET + +// func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·fftDIT2_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 96(AX), X5 + XORPS X6, X6 + MOVQ $0x0000000f, CX + MOVQ CX, X7 + PSHUFB X6, X7 + MOVQ x_len+8(FP), CX + MOVQ x_base+0(FP), DX + MOVQ y_base+24(FP), BX + +loop: + MOVUPS (BX), X9 + MOVUPS 32(BX), X10 + MOVAPS X9, X8 + PSRLQ $0x04, X8 + MOVAPS X9, X6 + PAND X7, X6 + PAND X7, X8 + MOVUPS X0, X11 + MOVUPS X1, X12 + PSHUFB X6, X11 + PSHUFB X6, X12 + MOVUPS X2, X6 + MOVUPS X3, X13 + PSHUFB X8, X6 + PSHUFB X8, X13 + PXOR X6, X11 + PXOR X13, X12 + MOVAPS X10, X6 + MOVAPS X10, X8 + PAND X7, X6 + PSRLQ $0x04, X8 + PAND X7, X8 + MOVUPS X4, X13 + MOVUPS X5, X14 + PSHUFB X6, X13 + PSHUFB X6, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 48(AX), X13 + MOVUPS 112(AX), X14 + PSHUFB X8, X13 + PSHUFB X8, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS (DX), X6 + MOVUPS 32(DX), X8 + PXOR X11, X6 + PXOR X12, X8 + MOVUPS X6, (DX) + MOVUPS X8, 32(DX) + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, (BX) + MOVUPS X10, 32(BX) + MOVUPS 16(BX), X9 + MOVUPS 48(BX), X10 + MOVAPS X9, X8 + PSRLQ $0x04, X8 + MOVAPS X9, X6 + PAND X7, X6 + PAND X7, X8 + MOVUPS X0, X11 + MOVUPS X1, X12 + PSHUFB X6, X11 + PSHUFB X6, X12 + MOVUPS X2, X6 + MOVUPS X3, X13 + PSHUFB X8, X6 + PSHUFB X8, X13 + PXOR X6, X11 + PXOR X13, X12 + MOVAPS X10, X6 + MOVAPS X10, X8 + PAND X7, X6 + PSRLQ $0x04, X8 + PAND X7, X8 + MOVUPS X4, X13 + MOVUPS X5, X14 + PSHUFB X6, X13 + PSHUFB X6, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 48(AX), X13 + MOVUPS 112(AX), X14 + PSHUFB X8, X13 + PSHUFB X8, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 16(DX), X6 + MOVUPS 48(DX), X8 + PXOR X11, X6 + PXOR X12, X8 + MOVUPS X6, 16(DX) + MOVUPS X8, 48(DX) + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, 16(BX) + MOVUPS X10, 48(BX) + ADDQ $0x40, DX + ADDQ $0x40, BX + SUBQ $0x40, CX + JNZ loop + RET + +// func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·mulgf16_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 96(AX), X5 + MOVUPS 48(AX), X6 + MOVUPS 112(AX), X7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + XORPS X8, X8 + MOVQ $0x0000000f, BX + MOVQ BX, X9 + PSHUFB X8, X9 + +loop: + MOVUPS (DX), X8 + MOVUPS 32(DX), X10 + MOVAPS X8, X11 + PSRLQ $0x04, X11 + MOVAPS X8, X8 + PAND X9, X8 + PAND X9, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X8, X12 + PSHUFB 
X8, X13 + MOVUPS X2, X8 + MOVUPS X3, X14 + PSHUFB X11, X8 + PSHUFB X11, X14 + PXOR X8, X12 + PXOR X14, X13 + MOVAPS X10, X8 + MOVAPS X10, X10 + PAND X9, X8 + PSRLQ $0x04, X10 + PAND X9, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X8, X11 + PSHUFB X8, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X6, X11 + MOVUPS X7, X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X12, (CX) + MOVUPS X13, 32(CX) + MOVUPS 16(DX), X8 + MOVUPS 48(DX), X10 + MOVAPS X8, X11 + PSRLQ $0x04, X11 + MOVAPS X8, X8 + PAND X9, X8 + PAND X9, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X8, X12 + PSHUFB X8, X13 + MOVUPS X2, X8 + MOVUPS X3, X14 + PSHUFB X11, X8 + PSHUFB X11, X14 + PXOR X8, X12 + PXOR X14, X13 + MOVAPS X10, X8 + MOVAPS X10, X10 + PAND X9, X8 + PSRLQ $0x04, X10 + PAND X9, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X8, X11 + PSHUFB X8, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X6, X11 + MOVUPS X7, X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X12, 16(CX) + MOVUPS X13, 48(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + RET + +// func ifftDIT28_avx2(x []byte, y []byte, table *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT28_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VMOVDQU (DX), Y5 + VMOVDQU 32(DX), Y6 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VMOVDQU Y5, (DX) + VMOVDQU Y6, 32(DX) + + // LEO_MULADD_256 + VPAND Y5, Y2, Y7 + VPSRLQ $0x04, Y5, Y5 + VPSHUFB Y7, Y0, Y7 + VPAND Y5, Y2, Y5 + VPSHUFB Y5, Y1, Y5 + XOR3WAY( $0x00, Y7, Y5, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y5 + VPSRLQ $0x04, Y6, Y6 + VPSHUFB Y5, Y0, Y5 + VPAND Y6, Y2, Y6 + VPSHUFB Y6, Y1, Y6 + XOR3WAY( $0x00, Y5, Y6, Y4) + VMOVDQU Y3, (CX) + VMOVDQU Y4, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JA loop + VZEROUPPER + RET + +// func fftDIT28_avx2(x []byte, y []byte, table *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT28_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VMOVDQU (DX), Y5 + VMOVDQU 32(DX), Y6 + + // LEO_MULADD_256 + VPAND Y5, Y2, Y7 + VPSRLQ $0x04, Y5, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y7 + VPSRLQ $0x04, Y6, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + VMOVDQU Y3, (CX) + VMOVDQU Y4, 32(CX) + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VMOVDQU Y5, (DX) + VMOVDQU Y6, 32(DX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_0(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 16(AX), Y0 + MOVQ t23+40(FP), CX + VBROADCASTI128 (CX), Y1 + VBROADCASTI128 16(CX), Y2 + MOVQ t02+48(FP), CX + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 16(CX), Y4 + MOVQ dist+24(FP), CX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ CX, SI + 
MOVQ (DX)(SI*1), R8 + ADDQ CX, SI + MOVQ (DX)(SI*1), R9 + ADDQ CX, SI + MOVQ (DX)(SI*1), CX + MOVQ $0x0000000f, DX + MOVQ DX, X5 + VPBROADCASTB X5, Y5 + +loop: + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU 32(R8), Y9 + VPXOR Y7, Y6, Y7 + VPXOR Y9, Y8, Y9 + VBROADCASTI128 (AX), Y10 + + // LEO_MULADD_256 + VPAND Y7, Y5, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y10, Y11 + VPAND Y12, Y5, Y12 + VPSHUFB Y12, Y0, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + + // LEO_MULADD_256 + VPAND Y9, Y5, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y10, Y11 + VPAND Y12, Y5, Y12 + VPSHUFB Y12, Y0, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VMOVDQU (R9), Y10 + VMOVDQU (CX), Y11 + VMOVDQU 32(R9), Y12 + VMOVDQU 32(CX), Y13 + VPXOR Y10, Y11, Y11 + VPXOR Y12, Y13, Y13 + + // LEO_MULADD_256 + VPAND Y11, Y5, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y1, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y2, Y15 + XOR3WAY( $0x00, Y14, Y15, Y10) + + // LEO_MULADD_256 + VPAND Y13, Y5, Y14 + VPSRLQ $0x04, Y13, Y15 + VPSHUFB Y14, Y1, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y2, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VPXOR Y9, Y13, Y13 + + // LEO_MULADD_256 + VPAND Y10, Y5, Y14 + VPSRLQ $0x04, Y10, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y5, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y5, Y14 + VPSRLQ $0x04, Y12, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y8) + + // LEO_MULADD_256 + VPAND Y13, Y5, Y14 + VPSRLQ $0x04, Y13, Y15 + VPSHUFB Y14, Y3, Y14 + VPAND Y15, Y5, Y15 + VPSHUFB Y15, Y4, Y15 + XOR3WAY( $0x00, Y14, Y15, Y9) + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (R9) + VMOVDQU Y12, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y11, (CX) + VMOVDQU Y13, 32(CX) + ADDQ $0x40, CX + SUBQ $0x40, BX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_0(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_0(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 16(AX), Y0 + MOVQ t23+40(FP), CX + VBROADCASTI128 16(CX), Y1 + MOVQ t02+48(FP), DX + VBROADCASTI128 (DX), Y2 + VBROADCASTI128 16(DX), Y3 + MOVQ dist+24(FP), DX + MOVQ work_base+0(FP), BX + MOVQ 8(BX), SI + XORQ DI, DI + MOVQ (BX)(DI*1), R8 + ADDQ DX, DI + MOVQ (BX)(DI*1), R9 + ADDQ DX, DI + MOVQ (BX)(DI*1), R10 + ADDQ DX, DI + MOVQ (BX)(DI*1), DX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (R8), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y10 + VMOVDQU (R9), Y7 + VMOVDQU 32(R9), Y8 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 
+ VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VBROADCASTI128 (AX), Y13 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y14 + VPSRLQ $0x04, Y7, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y14 + VPSRLQ $0x04, Y8, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y0, Y15 + XOR3WAY( $0x00, Y14, Y15, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VBROADCASTI128 (CX), Y13 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y14 + VPSRLQ $0x04, Y11, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y14 + VPSRLQ $0x04, Y12, Y15 + VPSHUFB Y14, Y13, Y14 + VPAND Y15, Y4, Y15 + VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y7, (R9) + VMOVDQU Y8, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y9, (R10) + VMOVDQU Y10, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + ADDQ $0x40, DX + SUBQ $0x40, SI + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_1(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y11) + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_1(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// 
Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_1(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_2(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPAND Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, 
Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_2(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_2(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y10) + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_3(SB), NOSPLIT, $0-56 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + + // 
LEO_MULADD_256 + VPAND Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_3(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_3(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_4(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU 32(SI), Y7 + VMOVDQU 32(DI), Y8 + VPXOR Y6, Y5, Y6 + VPXOR Y8, Y7, Y8 + + // LEO_MULADD_256 + VPAND Y6, Y4, Y9 + VPSRLQ $0x04, Y6, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y9 + VPSRLQ $0x04, Y8, Y10 + VPSHUFB Y9, Y0, Y9 + VPAND Y10, Y4, Y10 + VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) + VMOVDQU (R8), Y9 + VMOVDQU (AX), Y10 + VMOVDQU 32(R8), Y11 + VMOVDQU 32(AX), Y12 + 
VPXOR Y9, Y10, Y10 + VPXOR Y11, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y9) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y11) + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VPXOR Y7, Y11, Y11 + VPXOR Y8, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y7, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y6, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y11, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y10, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_4(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_4(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y2 + VBROADCASTI128 16(AX), Y3 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X4 + VPBROADCASTB X4, Y4 + +loop: + VMOVDQU (SI), Y5 + VMOVDQU 32(SI), Y6 + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y10 + VMOVDQU (DI), Y7 + VMOVDQU 32(DI), Y8 + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y12 + + // LEO_MULADD_256 + VPAND Y9, Y4, Y13 + VPSRLQ $0x04, Y9, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y4, Y13 + VPSRLQ $0x04, Y10, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + + // LEO_MULADD_256 + VPAND Y11, Y4, Y13 + VPSRLQ $0x04, Y11, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y7) + + // LEO_MULADD_256 + VPAND Y12, Y4, Y13 + VPSRLQ $0x04, Y12, Y14 + VPSHUFB Y13, Y2, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y3, Y14 + XOR3WAY( $0x00, Y13, Y14, Y8) + VPXOR Y5, Y9, Y9 + VPXOR Y7, Y11, Y11 + VPXOR Y6, Y10, Y10 + VPXOR Y8, Y12, Y12 + + // LEO_MULADD_256 + VPAND Y7, Y4, Y13 + VPSRLQ $0x04, Y7, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y5) + + // LEO_MULADD_256 + VPAND Y8, Y4, Y13 + VPSRLQ $0x04, Y8, Y14 + VPSHUFB Y13, Y0, Y13 + VPAND Y14, Y4, Y14 + VPSHUFB Y14, Y1, Y14 + XOR3WAY( $0x00, Y13, Y14, Y6) + VPXOR Y7, Y5, Y7 + VPXOR Y8, Y6, Y8 + VPXOR Y9, Y11, Y11 + VPXOR Y10, Y12, Y12 + VMOVDQU Y5, (SI) + VMOVDQU Y6, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y7, (DI) + VMOVDQU Y8, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y9, (R8) + VMOVDQU Y10, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y11, (AX) + VMOVDQU Y12, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_5(SB), NOSPLIT, $0-56 + MOVQ t23+40(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 
32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y7) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_5(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_5(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + + // LEO_MULADD_256 + VPAND Y5, Y2, Y11 + VPSRLQ $0x04, Y5, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y11 + VPSRLQ $0x04, Y6, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT48_avx2_6(SB), NOSPLIT, $0-56 + MOVQ t01+32(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(SI), Y5 + VMOVDQU 32(DI), Y6 + VPXOR Y4, Y3, Y4 + VPXOR Y6, Y5, Y6 + + // LEO_MULADD_256 + VPAND Y4, Y2, Y7 + VPSRLQ $0x04, Y4, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y3) + + // LEO_MULADD_256 + VPAND Y6, Y2, Y7 + VPSRLQ $0x04, Y6, Y8 + VPSHUFB Y7, Y0, Y7 + VPAND Y8, Y2, Y8 + VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) + VMOVDQU (R8), Y7 + VMOVDQU (AX), Y8 + VMOVDQU 32(R8), Y9 + VMOVDQU 32(AX), Y10 + VPXOR Y7, Y8, Y8 + VPXOR Y9, Y10, Y10 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + VPXOR Y5, Y9, Y9 + VPXOR Y6, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y5, 
32(SI) + ADDQ $0x40, SI + VMOVDQU Y4, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y9, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y8, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_6(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT48_avx2_6(SB), NOSPLIT, $0-56 + MOVQ t02+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 16(AX), Y1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X2 + VPBROADCASTB X2, Y2 + +loop: + VMOVDQU (SI), Y3 + VMOVDQU 32(SI), Y4 + VMOVDQU (R8), Y7 + VMOVDQU 32(R8), Y8 + VMOVDQU (DI), Y5 + VMOVDQU 32(DI), Y6 + VMOVDQU (AX), Y9 + VMOVDQU 32(AX), Y10 + + // LEO_MULADD_256 + VPAND Y7, Y2, Y11 + VPSRLQ $0x04, Y7, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y3) + + // LEO_MULADD_256 + VPAND Y8, Y2, Y11 + VPSRLQ $0x04, Y8, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y4) + + // LEO_MULADD_256 + VPAND Y9, Y2, Y11 + VPSRLQ $0x04, Y9, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y5) + + // LEO_MULADD_256 + VPAND Y10, Y2, Y11 + VPSRLQ $0x04, Y10, Y12 + VPSHUFB Y11, Y0, Y11 + VPAND Y12, Y2, Y12 + VPSHUFB Y12, Y1, Y12 + XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y3, Y7, Y7 + VPXOR Y5, Y9, Y9 + VPXOR Y4, Y8, Y8 + VPXOR Y6, Y10, Y10 + VPXOR Y5, Y3, Y5 + VPXOR Y6, Y4, Y6 + VPXOR Y7, Y9, Y9 + VPXOR Y8, Y10, Y10 + VMOVDQU Y3, (SI) + VMOVDQU Y4, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y5, (DI) + VMOVDQU Y6, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y7, (R8) + VMOVDQU Y8, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y9, (AX) + VMOVDQU Y10, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·ifftDIT48_avx2_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (SI), Y0 + VMOVDQU (DI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU 32(DI), Y3 + VPXOR Y1, Y0, Y1 + VPXOR Y3, Y2, Y3 + VMOVDQU (R8), Y4 + VMOVDQU (AX), Y5 + VMOVDQU 32(R8), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y4, Y5, Y5 + VPXOR Y6, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y2, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y1, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y6, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_avx2_7(work [][]byte, dist int, t01 *[32]uint8, t23 *[32]uint8, t02 *[32]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·fftDIT48_avx2_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + MOVQ $0x0000000f, CX + MOVQ CX, X0 + VPBROADCASTB X0, Y0 + +loop: + VMOVDQU (SI), Y0 
+ VMOVDQU 32(SI), Y1 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (AX), Y6 + VMOVDQU 32(AX), Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y2, Y6, Y6 + VPXOR Y1, Y5, Y5 + VPXOR Y3, Y7, Y7 + VPXOR Y2, Y0, Y2 + VPXOR Y3, Y1, Y3 + VPXOR Y4, Y6, Y6 + VPXOR Y5, Y7, Y7 + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y6, (AX) + VMOVDQU Y7, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z5, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VPTERNLOGD $0x96, Z7, Z3, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + VXORPD Z5, Z6, Z6 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VPTERNLOGD $0x96, Z6, 
Z2, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) 
+// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z3, Z5, Z3 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + 
VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VPTERNLOGD $0x96, Z5, Z1, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) 
+ ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go new file mode 100644 index 000000000..1bb268a3b --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_none.go @@ -0,0 +1,33 @@ +//go:build !amd64 || noasm || appengine || gccgo || nogen + +package reedsolomon + +const maxAvx2Inputs = 1 +const maxAvx2Outputs = 1 +const minAvx2Size = 1 +const avxSizeMask = 0 +const avx2CodeGen = false + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go new file mode 100644 index 000000000..298bf5040 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.go @@ -0,0 +1,2264 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. 
+ +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +package reedsolomon + +func _dummy_() + +//go:noescape +func sSE2XorSlice(in []byte, out []byte) + +//go:noescape +func sSE2XorSlice_64(in []byte, out []byte) + +//go:noescape +func avx2XorSlice_64(in []byte, out []byte) + +// mulGFNI_1x1_64 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. 
+// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. 
+// +//go:noescape +func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. 
+// +//go:noescape +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. 
+// +//go:noescape +func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. 
+// +//go:noescape +func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. 
+// +//go:noescape +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. 
+// +//go:noescape +func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. 
+// +//go:noescape +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. 
+// +//go:noescape +func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. 
+// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) + +//go:noescape +func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s new file mode 100644 index 000000000..5782759c6 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_nopshufb_amd64.s @@ -0,0 +1,67987 @@ +// Code generated by command: go run gen.go -out ../galois_gen_nopshufb_amd64.s -stubs ../galois_gen_nopshufb_amd64.go -pkg=reedsolomon. DO NOT EDIT. + +//go:build !appengine && !noasm && !nogen && nopshufb && gc + +#include "textflag.h" + +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#define XOR3WAY(ignore, a, b, dst) \ + VPTERNLOGD $0x96, a, b, dst + +#else +#define XOR3WAY(ignore, a, b, dst) \ + VPXOR a, dst, dst \ + VPXOR b, dst, dst + +#endif + RET + +// sSE2XorSlice will XOR in with out and store in out. +// Processes 16 bytes/loop. + +// func sSE2XorSlice(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x04, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU (CX), X1 + PXOR X0, X1 + MOVOU X1, (CX) + ADDQ $0x10, AX + ADDQ $0x10, CX + DECQ DX + JNZ loop + +end: + RET + +// sSE2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. 
+ +// func sSE2XorSlice_64(in []byte, out []byte) +// Requires: SSE2 +TEXT ·sSE2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + MOVOU (AX), X0 + MOVOU 16(AX), X2 + MOVOU 32(AX), X4 + MOVOU 48(AX), X6 + MOVOU (CX), X1 + MOVOU 16(CX), X3 + MOVOU 32(CX), X5 + MOVOU 48(CX), X7 + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (CX) + MOVOU X3, 16(CX) + MOVOU X5, 32(CX) + MOVOU X7, 48(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + RET + +// avx2XorSlice_64 will XOR in with out and store in out. +// Processes 64 bytes/loop. + +// func avx2XorSlice_64(in []byte, out []byte) +// Requires: AVX, AVX2 +TEXT ·avx2XorSlice_64(SB), $0-48 + MOVQ in_base+0(FP), AX + MOVQ out_base+24(FP), CX + MOVQ in_len+8(FP), DX + SHRQ $0x06, DX + JZ end + +loop: + VMOVDQU (AX), Y0 + VMOVDQU 32(AX), Y2 + VMOVDQU (CX), Y1 + VMOVDQU 32(CX), Y3 + VPXOR Y0, Y1, Y1 + VPXOR Y2, Y3, Y3 + VMOVDQU Y1, (CX) + VMOVDQU Y3, 32(CX) + ADDQ $0x40, AX + ADDQ $0x40, CX + DECQ DX + JNZ loop + +end: + VZEROUPPER + RET + +// func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z1 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z1, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64_loop + VZEROUPPER + +mulGFNI_1x1_64_end: + RET + +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + +// func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ 
start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulGFNI_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (DX), Z1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (CX), Z2 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z2, Z2 + VXORPD Z1, Z2, Z1 + + // Store 1 outputs + VMOVDQU64 Z1, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x1_64Xor_loop + VZEROUPPER + +mulGFNI_1x1_64Xor_end: + RET + +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1Xor_loop + VZEROUPPER + +mulAvxGFNI_1x1Xor_end: + RET + +// func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64_loop + VZEROUPPER + +mulGFNI_1x2_64_end: + RET + +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + 
VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + +// func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulGFNI_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (BX), Z2 + VMOVDQU64 (DX), Z3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z5 + VXORPD Z3, Z5, Z3 + + // Store 2 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z3, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x2_64Xor_loop + VZEROUPPER + +mulGFNI_1x2_64Xor_end: + RET + +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2Xor_loop + VZEROUPPER + +mulAvxGFNI_1x2Xor_end: + RET + +// func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + + // Store 3 
outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64_loop + VZEROUPPER + +mulGFNI_1x3_64_end: + RET + +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + +// func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulGFNI_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (BX), Z3 + VMOVDQU64 (SI), Z4 + VMOVDQU64 (DX), Z5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z3, Z7, Z3 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 3 outputs + VMOVDQU64 Z3, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x3_64Xor_loop + VZEROUPPER + +mulGFNI_1x3_64Xor_end: + RET + +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + 
ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3Xor_loop + VZEROUPPER + +mulAvxGFNI_1x3Xor_end: + RET + +// func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z7, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z7, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64_loop + VZEROUPPER + +mulGFNI_1x4_64_end: + RET + +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + +// func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP 
registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulGFNI_1x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (BX), Z4 + VMOVDQU64 (SI), Z5 + VMOVDQU64 (DI), Z6 + VMOVDQU64 (DX), Z7 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z4, Z9, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z5, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 4 outputs + VMOVDQU64 Z4, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z5, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x4_64Xor_loop + VZEROUPPER + +mulGFNI_1x4_64Xor_end: + RET + +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + +// func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + 
MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64_loop + VZEROUPPER + +mulGFNI_1x5_64_end: + RET + +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + +// func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulGFNI_1x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (BX), Z5 + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DX), Z9 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z5, Z11, Z5 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z6, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z2, Z10, 
Z11 + VXORPD Z7, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 5 outputs + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x5_64Xor_loop + VZEROUPPER + +mulGFNI_1x5_64Xor_end: + RET + +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + +// func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z11, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z11, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + 
ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64_loop + VZEROUPPER + +mulGFNI_1x6_64_end: + RET + +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + +// func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulGFNI_1x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (BX), Z6 + VMOVDQU64 (SI), Z7 + VMOVDQU64 (DI), Z8 + VMOVDQU64 (R8), Z9 + VMOVDQU64 (R9), Z10 + VMOVDQU64 (DX), Z11 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z6, Z13, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z7, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 6 outputs + VMOVDQU64 Z6, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z7, (SI) + 
ADDQ $0x40, SI + VMOVDQU64 Z8, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z9, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x6_64Xor_loop + VZEROUPPER + +mulGFNI_1x6_64Xor_end: + RET + +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + +// func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z13 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z13, Z7 + VGF2P8AFFINEQB $0x00, Z1, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z13, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z13, Z12 + VGF2P8AFFINEQB 
$0x00, Z6, Z13, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64_loop + VZEROUPPER + +mulGFNI_1x7_64_end: + RET + +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + +// func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulGFNI_1x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (BX), Z7 + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (R9), Z11 + VMOVDQU64 (R10), Z12 + VMOVDQU64 (DX), Z13 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z7, Z15, Z7 + 
VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z8, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z9, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z10, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z11, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 7 outputs + VMOVDQU64 Z7, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x7_64Xor_loop + VZEROUPPER + +mulGFNI_1x7_64Xor_end: + RET + +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + +// func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + 
MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64_loop + VZEROUPPER + +mulGFNI_1x8_64_end: + RET + +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + +// func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + 
VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DX + + // Add start offset to input + ADDQ R12, CX + +mulGFNI_1x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (BX), Z8 + VMOVDQU64 (SI), Z9 + VMOVDQU64 (DI), Z10 + VMOVDQU64 (R8), Z11 + VMOVDQU64 (R9), Z12 + VMOVDQU64 (R10), Z13 + VMOVDQU64 (R11), Z14 + VMOVDQU64 (DX), Z15 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z8, Z17, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z9, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z10, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z11, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 8 outputs + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z9, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z10, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z12, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z13, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x8_64Xor_loop + VZEROUPPER + +mulGFNI_1x8_64Xor_end: + RET + +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, 
Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + +// func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z17 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z17, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z17, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z17, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z17, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z17, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z17, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64_loop + VZEROUPPER + +mulGFNI_1x9_64_end: + RET + +// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ 
R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + +// func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DX + + // Add start offset to input + ADDQ R13, CX + +mulGFNI_1x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (BX), Z9 + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (R10), Z14 + VMOVDQU64 (R11), Z15 + VMOVDQU64 (R12), Z16 + VMOVDQU64 (DX), Z17 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z9, Z19, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z10, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z11, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 9 outputs + VMOVDQU64 Z9, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z15, (R11) + ADDQ $0x40, R11 + 
VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x9_64Xor_loop + VZEROUPPER + +mulGFNI_1x9_64Xor_end: + RET + +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9Xor_loop + VZEROUPPER + +mulAvxGFNI_1x9Xor_end: + RET + +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 
168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z19 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + +// func mulGFNI_1x10_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64Xor_loop + VZEROUPPER + +mulGFNI_1x10_64Xor_end: + RET + +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, 
DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + +// func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64_loop + VZEROUPPER + +mulGFNI_2x1_64_end: + RET + +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start 
offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + +// func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulGFNI_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (BX), Z2 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z3 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (CX), Z3 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z1, Z3, Z3 + VXORPD Z2, Z3, Z2 + + // Store 1 outputs + VMOVDQU64 Z2, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x1_64Xor_loop + VZEROUPPER + +mulGFNI_2x1_64Xor_end: + RET + +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1Xor_loop + VZEROUPPER + +mulAvxGFNI_2x1Xor_end: + RET + +// func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ 
in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64_loop + VZEROUPPER + +mulGFNI_2x2_64_end: + RET + +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + +// func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulGFNI_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (SI), Z4 + VMOVDQU64 (BX), Z5 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VGF2P8AFFINEQB $0x00, Z3, Z6, Z7 + VXORPD Z5, Z7, Z5 + + 
// Store 2 outputs + VMOVDQU64 Z4, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z5, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x2_64Xor_loop + VZEROUPPER + +mulGFNI_2x2_64Xor_end: + RET + +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2Xor_loop + VZEROUPPER + +mulAvxGFNI_2x2Xor_end: + RET + +// func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64_loop + VZEROUPPER + +mulGFNI_2x3_64_end: + RET + +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + +// func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulGFNI_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (SI), Z6 + VMOVDQU64 (DI), Z7 + VMOVDQU64 (BX), Z8 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z2, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z9, Z10 + VXORPD Z6, Z10, Z6 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z10 + VXORPD Z7, Z10, Z7 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z10 + VXORPD Z8, Z10, Z8 + + // Store 3 outputs + VMOVDQU64 Z6, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z7, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z8, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x3_64Xor_loop + VZEROUPPER + +mulGFNI_2x3_64Xor_end: + RET + +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 
16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3Xor_loop + VZEROUPPER + +mulAvxGFNI_2x3Xor_end: + RET + +// func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64_loop + VZEROUPPER + +mulGFNI_2x4_64_end: + RET + +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + +// func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulGFNI_2x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (SI), Z8 + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (BX), Z11 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z8, Z13, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 4 outputs + VMOVDQU64 Z8, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x4_64Xor_loop + VZEROUPPER + +mulGFNI_2x4_64Xor_end: + RET + +// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + +// func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + 
VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64_loop + VZEROUPPER + +mulGFNI_2x5_64_end: + RET + +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + +// func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + 
ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, CX + +mulGFNI_2x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (SI), Z10 + VMOVDQU64 (DI), Z11 + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (BX), Z14 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z10, Z16, Z10 + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z11, Z16, Z11 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 5 outputs + VMOVDQU64 Z10, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z11, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x5_64Xor_loop + VZEROUPPER + +mulGFNI_2x5_64Xor_end: + RET + +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + +// func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64_loop + VZEROUPPER + +mulGFNI_2x6_64_end: + RET + +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), 
Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + +// func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, CX + +mulGFNI_2x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (SI), Z12 + VMOVDQU64 (DI), Z13 + VMOVDQU64 (R8), Z14 + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (BX), Z17 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z12, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z13, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z14, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 6 outputs + VMOVDQU64 Z12, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z13, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z14, 
(R8) + ADDQ $0x40, R8 + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x6_64Xor_loop + VZEROUPPER + +mulGFNI_2x6_64Xor_end: + RET + +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + +// func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + 
VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64_loop + VZEROUPPER + +mulGFNI_2x7_64_end: + RET + +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + +// func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, CX + +mulGFNI_2x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (SI), Z14 + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (BX), Z20 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z14, Z22, Z14 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z15, Z22, Z15 + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z16, Z22, Z16 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z17, Z22, Z17 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 7 outputs + VMOVDQU64 Z14, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, 
R11 + VMOVDQU64 Z20, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x7_64Xor_loop + VZEROUPPER + +mulGFNI_2x7_64Xor_end: + RET + +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + +// func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + 
VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64_loop + VZEROUPPER + +mulGFNI_2x8_64_end: + RET + +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + 
VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + +// func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, CX + +mulGFNI_2x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (SI), Z16 + VMOVDQU64 (DI), Z17 + VMOVDQU64 (R8), Z18 + VMOVDQU64 (R9), Z19 + VMOVDQU64 (R10), Z20 + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (BX), Z23 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + 
VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z16, Z25, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z17, Z25, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 8 outputs + VMOVDQU64 Z16, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z17, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z18, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z19, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z20, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x8_64Xor_loop + VZEROUPPER + +mulGFNI_2x8_64Xor_end: + RET + +// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + +// func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + 
VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64_loop + VZEROUPPER + +mulGFNI_2x9_64_end: + RET + +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + +// func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + 
VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, CX + +mulGFNI_2x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (SI), Z18 + VMOVDQU64 (DI), Z19 + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (BX), Z26 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z18, Z28, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z19, Z28, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z20, Z28, Z20 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z21, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z22, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z23, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 9 outputs + VMOVDQU64 Z18, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x9_64Xor_loop + VZEROUPPER + +mulGFNI_2x9_64Xor_end: + RET + +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + +// func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + 
VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64_loop + VZEROUPPER + +mulGFNI_2x10_64_end: + RET + +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, 
R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + +// func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_2x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), 
R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, CX + +mulGFNI_2x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (SI), Z20 + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (BX), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_2x10_64Xor_loop + VZEROUPPER + +mulGFNI_2x10_64Xor_end: + RET + +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + 
+mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + +// func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z3 + + // Load and process 64 bytes from input 1 to 
1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64_loop + VZEROUPPER + +mulGFNI_3x1_64_end: + RET + +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + +// func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulGFNI_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (SI), Z3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z4 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z4 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (CX), Z4 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z2, Z4, Z4 + VXORPD Z3, Z4, Z3 + + // Store 1 outputs + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x1_64Xor_loop + VZEROUPPER + +mulGFNI_3x1_64Xor_end: + RET + +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1Xor_loop + VZEROUPPER + +mulAvxGFNI_3x1Xor_end: + RET + +// func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64_loop + VZEROUPPER + +mulGFNI_3x2_64_end: + RET + +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add 
start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + +// func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulGFNI_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (DI), Z6 + VMOVDQU64 (SI), Z7 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z1, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z3, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z8, Z9 + VXORPD Z6, Z9, Z6 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z9 + VXORPD Z7, Z9, Z7 + + // Store 2 outputs + VMOVDQU64 Z6, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z7, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x2_64Xor_loop + VZEROUPPER + +mulGFNI_3x2_64Xor_end: + RET + +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input 
+ ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2Xor_loop + VZEROUPPER + +mulAvxGFNI_3x2Xor_end: + RET + +// func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x3_64_loop + VZEROUPPER + +mulGFNI_3x3_64_end: + RET + +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + 
VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + +// func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulGFNI_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (DI), Z9 + VMOVDQU64 (R8), Z10 + VMOVDQU64 (SI), Z11 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z9, Z13, Z9 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 3 outputs + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z10, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ 
mulGFNI_3x3_64Xor_loop + VZEROUPPER + +mulGFNI_3x3_64Xor_end: + RET + +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3Xor_loop + VZEROUPPER + +mulAvxGFNI_3x3Xor_end: + RET + +// func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z13 + VGF2P8AFFINEQB 
$0x00, Z2, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64_loop + VZEROUPPER + +mulGFNI_3x4_64_end: + RET + +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + +// func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept 
in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, CX + +mulGFNI_3x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (SI), Z15 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z12, Z17, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z13, Z17, Z13 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 4 outputs + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x4_64Xor_loop + VZEROUPPER + +mulGFNI_3x4_64Xor_end: + RET + +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes 
from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + +mulAvxGFNI_3x4Xor_end: + RET + +// func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, 
Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64_loop + VZEROUPPER + +mulGFNI_3x5_64_end: + RET + +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + +// func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + 
VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, CX + +mulGFNI_3x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (DI), Z15 + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (SI), Z19 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 5 outputs + VMOVDQU64 Z15, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x5_64Xor_loop + VZEROUPPER + +mulGFNI_3x5_64Xor_end: + RET + +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + 
// Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + +// func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z23 + + // Load and process 64 
bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64_loop + VZEROUPPER + +mulGFNI_3x6_64_end: + RET + +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + +// func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, CX + +mulGFNI_3x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (DI), Z18 + VMOVDQU64 (R8), Z19 + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (SI), Z23 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z18, Z25, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z19, Z25, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + 
VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 6 outputs + VMOVDQU64 Z18, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z19, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x6_64Xor_loop + VZEROUPPER + +mulGFNI_3x6_64Xor_end: + RET + +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + +// func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64_loop + VZEROUPPER + +mulGFNI_3x7_64_end: + RET + +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// 
Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + +// func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x7_64Xor_end + 
VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, CX + +mulGFNI_3x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (DI), Z21 + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (SI), Z27 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z21, Z29, Z21 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z22, Z29, Z22 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z23, Z29, Z23 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 7 outputs + VMOVDQU64 Z21, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x7_64Xor_loop + VZEROUPPER + +mulGFNI_3x7_64Xor_end: + RET + +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full 
registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + +// func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64(SB), $0-88 + // Loading 22 of 24 tables to registers + 
// Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64_loop + VZEROUPPER + +mulGFNI_3x8_64_end: + RET + +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI 
+TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + +// func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: 
AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 + // Loading 22 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulGFNI_3x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R8), Z22 + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R8) + ADDQ $0x40, R8 + 
VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x8_64Xor_loop + VZEROUPPER + +mulGFNI_3x8_64Xor_end: + RET + +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + +// func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 
+ ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64_loop + VZEROUPPER + +mulGFNI_3x9_64_end: + RET + +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + +// func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 + // Loading 21 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulGFNI_3x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_3x9_64Xor_loop + VZEROUPPER + +mulGFNI_3x9_64Xor_end: + RET + +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 
9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + +// func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), 
Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64_loop + VZEROUPPER + 
+mulGFNI_3x10_64_end: + RET + +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + +// func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 + // Loading 20 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_3x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_3x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (DI), Z20 + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (SI), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + 
VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + VMOVDQU64 Z20, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_3x10_64Xor_loop + VZEROUPPER + +mulGFNI_3x10_64Xor_end: + RET + +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + +// func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 
24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64_loop + VZEROUPPER + +mulGFNI_4x1_64_end: + RET + +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + +// func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulGFNI_4x1_64Xor_loop: + // Load 1 
outputs + VMOVDQU64 (DI), Z4 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z5 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z5 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z5 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (CX), Z5 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z3, Z5, Z5 + VXORPD Z4, Z5, Z4 + + // Store 1 outputs + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x1_64Xor_loop + VZEROUPPER + +mulGFNI_4x1_64Xor_end: + RET + +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1Xor_loop + VZEROUPPER + +mulAvxGFNI_4x1Xor_end: + RET + +// func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + 
VGF2P8AFFINEQB $0x00, Z0, Z10, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64_loop + VZEROUPPER + +mulGFNI_4x2_64_end: + RET + +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + +// func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 
24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulGFNI_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R8), Z8 + VMOVDQU64 (DI), Z9 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z1, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z3, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z10, Z11 + VXORPD Z8, Z11, Z8 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z11 + VXORPD Z9, Z11, Z9 + + // Store 2 outputs + VMOVDQU64 Z8, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z9, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x2_64Xor_loop + VZEROUPPER + +mulGFNI_4x2_64Xor_end: + RET + +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2Xor_loop + VZEROUPPER + +mulAvxGFNI_4x2Xor_end: + RET + +// 
func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64_loop + VZEROUPPER + +mulGFNI_4x3_64_end: + RET + +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes 
from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + +// func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, CX + +mulGFNI_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R8), Z12 + VMOVDQU64 (R9), Z13 + VMOVDQU64 (DI), Z14 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z15 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z2, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z15 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z4, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z5, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z15 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z15, Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z8, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (CX), Z15 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z15, Z16 + VXORPD Z12, Z16, Z12 + VGF2P8AFFINEQB $0x00, Z10, Z15, 
Z16 + VXORPD Z13, Z16, Z13 + VGF2P8AFFINEQB $0x00, Z11, Z15, Z16 + VXORPD Z14, Z16, Z14 + + // Store 3 outputs + VMOVDQU64 Z12, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z14, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x3_64Xor_loop + VZEROUPPER + +mulGFNI_4x3_64Xor_end: + RET + +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3Xor_loop + VZEROUPPER + +mulAvxGFNI_4x3Xor_end: + RET + +// func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + 
VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64_loop + VZEROUPPER + +mulGFNI_4x4_64_end: + RET + +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + 
// Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + +// func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, CX + +mulGFNI_4x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R8), Z16 + VMOVDQU64 (R9), Z17 + VMOVDQU64 (R10), Z18 + VMOVDQU64 (DI), Z19 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD 
Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 4 outputs + VMOVDQU64 Z16, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z17, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x4_64Xor_loop + VZEROUPPER + +mulGFNI_4x4_64Xor_end: + RET + +// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + +// func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 
Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64_loop + VZEROUPPER + +mulGFNI_4x5_64_end: + RET + +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + +// func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 + // Loading all 
tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, CX + +mulGFNI_4x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R8), Z20 + VMOVDQU64 (R9), Z21 + VMOVDQU64 (R10), Z22 + VMOVDQU64 (R11), Z23 + VMOVDQU64 (DI), Z24 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z25 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z25 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z6, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z9, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z25 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (CX), Z25 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z25, Z26 + VXORPD Z20, Z26, Z20 + VGF2P8AFFINEQB $0x00, Z16, Z25, Z26 + VXORPD Z21, Z26, Z21 + VGF2P8AFFINEQB $0x00, Z17, Z25, Z26 + VXORPD Z22, Z26, Z22 + VGF2P8AFFINEQB $0x00, Z18, Z25, Z26 + VXORPD Z23, Z26, Z23 + VGF2P8AFFINEQB $0x00, Z19, Z25, Z26 + VXORPD Z24, Z26, Z24 + + // Store 5 outputs + VMOVDQU64 Z20, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z21, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z22, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z23, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z24, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x5_64Xor_loop + VZEROUPPER + +mulGFNI_4x5_64Xor_end: + RET + +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to 
registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + +// func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ 
n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64_loop + VZEROUPPER + +mulGFNI_4x6_64_end: + RET + +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + +// func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start 
int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, CX + +mulGFNI_4x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R8), Z24 + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD 
Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x6_64Xor_loop + VZEROUPPER + +mulGFNI_4x6_64Xor_end: + RET + +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, 
Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + +// func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, 
Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64_loop + VZEROUPPER + +mulGFNI_4x7_64_end: + RET + +// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + +// func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 + // Loading 23 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulGFNI_4x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R9), Z23 + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 
(BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x7_64Xor_loop + VZEROUPPER + +mulGFNI_4x7_64Xor_end: + RET + +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI 
+ ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + +// func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + 
MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), 
Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64_loop + VZEROUPPER + +mulGFNI_4x8_64_end: + RET + +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + +// func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 + // Loading 22 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulGFNI_4x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + 
VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_4x8_64Xor_loop + VZEROUPPER + +mulGFNI_4x8_64Xor_end: + RET + +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + 
ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) 
+ ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + +// func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 
200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64_loop + VZEROUPPER + +mulGFNI_4x9_64_end: + RET + +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + +// func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 + // Loading 21 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + 
VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_4x9_64Xor_loop: + // Load 9 outputs + VMOVDQU64 (R8), Z21 + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (DI), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + VMOVDQU64 Z21, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_4x9_64Xor_loop + VZEROUPPER + +mulGFNI_4x9_64Xor_end: + RET + +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + +// func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ 
out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + 
+mulGFNI_4x10_64_end: + RET + +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 
248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD 
Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop + VZEROUPPER + +mulGFNI_4x10_64Xor_end: + RET + +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10Xor_loop + VZEROUPPER + +mulAvxGFNI_4x10Xor_end: + RET + +// func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ 
mulGFNI_5x1_64_loop + VZEROUPPER + +mulGFNI_5x1_64_end: + RET + +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + +// func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulGFNI_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R8), Z5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z6 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z6 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z6 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z6 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z6, Z6 + VXORPD Z5, Z6, Z5 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (CX), Z6 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z4, Z6, Z6 + VXORPD Z5, Z6, Z5 
+ + // Store 1 outputs + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x1_64Xor_loop + VZEROUPPER + +mulGFNI_5x1_64Xor_end: + RET + +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1Xor_loop + VZEROUPPER + +mulAvxGFNI_5x1Xor_end: + RET + +// func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 
to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64_loop + VZEROUPPER + +mulGFNI_5x2_64_end: + RET + +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + +// func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + 
VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulGFNI_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R9), Z10 + VMOVDQU64 (R8), Z11 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z12 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z12 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z3, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z12 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z5, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z12 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z7, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (CX), Z12 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z12, Z13 + VXORPD Z10, Z13, Z10 + VGF2P8AFFINEQB $0x00, Z9, Z12, Z13 + VXORPD Z11, Z13, Z11 + + // Store 2 outputs + VMOVDQU64 Z10, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z11, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x2_64Xor_loop + VZEROUPPER + +mulGFNI_5x2_64Xor_end: + RET + +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + 
VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2Xor_loop + VZEROUPPER + +mulAvxGFNI_5x2Xor_end: + RET + +// func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + 
VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64_loop + VZEROUPPER + +mulGFNI_5x3_64_end: + RET + +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + +// func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 
48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, CX + +mulGFNI_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R9), Z15 + VMOVDQU64 (R10), Z16 + VMOVDQU64 (R8), Z17 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z15, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 3 outputs + VMOVDQU64 Z15, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z16, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z17, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x3_64Xor_loop + VZEROUPPER + +mulGFNI_5x3_64Xor_end: + RET + +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + 
+mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3Xor_loop + VZEROUPPER + +mulAvxGFNI_5x3Xor_end: + RET + +// func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z22 + VGF2P8AFFINEQB 
$0x00, Z3, Z24, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64_loop + VZEROUPPER + +mulGFNI_5x4_64_end: + RET + +// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, 
Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4_loop + VZEROUPPER + +mulAvxGFNI_5x4_end: + RET + +// func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, CX + +mulGFNI_5x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R9), Z20 + VMOVDQU64 (R10), Z21 + VMOVDQU64 (R11), Z22 + VMOVDQU64 (R8), Z23 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z21, Z25, Z21 + 
VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z20, Z25, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 4 outputs + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z21, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z22, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z23, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x4_64Xor_loop + VZEROUPPER + +mulGFNI_5x4_64Xor_end: + RET + +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + +// func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64_loop + VZEROUPPER + +mulGFNI_5x5_64_end: + RET + +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + 
// Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + +// func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + 
ADDQ R13, DI + ADDQ R13, CX + +mulGFNI_5x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R9), Z25 + VMOVDQU64 (R10), Z26 + VMOVDQU64 (R11), Z27 + VMOVDQU64 (R12), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z26, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z27, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z28, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x5_64Xor_loop + VZEROUPPER + +mulGFNI_5x5_64Xor_end: + RET + +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add 
start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + +// func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + 
VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + 
VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64_loop + VZEROUPPER + +mulGFNI_5x6_64_end: + RET + +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + 
VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + +// func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 + // Loading 24 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulGFNI_5x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, 
Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x6_64Xor_loop + VZEROUPPER + +mulGFNI_5x6_64Xor_end: + RET + +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + +// func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + 
VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64_loop + VZEROUPPER + +mulGFNI_5x7_64_end: + RET + +// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + +// func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 + // Loading 23 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulGFNI_5x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes 
from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_5x7_64Xor_loop + VZEROUPPER + +mulGFNI_5x7_64Xor_end: + RET + +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 
16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + +// func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, 
Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64_loop + VZEROUPPER + +mulGFNI_5x8_64_end: + RET + +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + 
ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + +// func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 + // Loading 22 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_5x8_64Xor_loop: + // Load 8 outputs + VMOVDQU64 (R9), Z22 + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R8), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD 
Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + VMOVDQU64 Z22, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_5x8_64Xor_loop + VZEROUPPER + +mulGFNI_5x8_64Xor_end: + RET + +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + 
MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + +// func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64_loop + VZEROUPPER + +mulGFNI_5x9_64_end: + RET + +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ 
out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + +// func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 + // Loading 21 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x9_64Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, 
Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x9_64Xor_loop + 
VZEROUPPER + +mulGFNI_5x9_64Xor_end: + RET + +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + +// func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_5x10_64(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 
144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 
352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64_loop + VZEROUPPER + +mulGFNI_5x10_64_end: + RET + +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + +// func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI 
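+//
+// Note on the generated GFNI kernels in this file (added here for readability;
+// the assembly itself is machine-generated upstream, so the summary below is a
+// reading aid, not part of the generator's output):
+//   - mulGFNI_RxC_64 combines R input shards into C output shards, processing
+//     64 bytes per iteration in ZMM registers (AVX-512 + GFNI).
+//   - mulAvxGFNI_RxC does the same 32 bytes at a time in YMM registers
+//     (AVX + GFNI only).
+//   - The ...Xor variants first load the existing output blocks and accumulate
+//     into them instead of overwriting.
+// Each coefficient of the encoding matrix is an 8x8 bit matrix broadcast from
+// the table at (CX); one VGF2P8AFFINEQB per (input, output) pair applies it to
+// the input block, and the partial products are folded together with VXORPD.
+//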
+TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 + // Loading 20 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_5x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulGFNI_5x10_64Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU64 (R11)(R10*1), Z20 + MOVQ 24(R9), R11 + VMOVDQU64 (R11)(R10*1), Z21 + MOVQ 48(R9), R11 + VMOVDQU64 (R11)(R10*1), Z22 + MOVQ 72(R9), R11 + VMOVDQU64 (R11)(R10*1), Z23 + MOVQ 96(R9), R11 + VMOVDQU64 (R11)(R10*1), Z24 + MOVQ 120(R9), R11 + VMOVDQU64 (R11)(R10*1), Z25 + MOVQ 144(R9), R11 + VMOVDQU64 (R11)(R10*1), Z26 + MOVQ 168(R9), R11 + VMOVDQU64 (R11)(R10*1), Z27 + MOVQ 192(R9), R11 + VMOVDQU64 (R11)(R10*1), Z28 + MOVQ 216(R9), R11 + VMOVDQU64 (R11)(R10*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 
+ VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU64 Z20, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU64 Z21, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU64 Z22, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU64 Z23, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU64 Z24, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU64 Z25, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU64 Z26, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU64 Z27, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU64 Z28, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU64 Z29, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x40, R10 + DECQ AX + JNZ mulGFNI_5x10_64Xor_loop + VZEROUPPER + +mulGFNI_5x10_64Xor_end: + RET + +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU 
(R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + +// func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, 
Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64_loop + VZEROUPPER + +mulGFNI_6x1_64_end: + RET + +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + +// func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ 
R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulGFNI_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R9), Z6 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z7 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z7 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z7 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z7 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z7 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (CX), Z7 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z5, Z7, Z7 + VXORPD Z6, Z7, Z6 + + // Store 1 outputs + VMOVDQU64 Z6, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x1_64Xor_loop + VZEROUPPER + +mulGFNI_6x1_64Xor_end: + RET + +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1Xor_loop + VZEROUPPER + +mulAvxGFNI_6x1Xor_end: + RET + +// func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used 
+ MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64_loop + VZEROUPPER + +mulGFNI_6x2_64_end: + RET + +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes 
from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + +// func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulGFNI_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R10), Z12 + VMOVDQU64 (R9), Z13 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z14 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z1, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z14 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z14 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z5, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z14 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z7, Z14, Z15 + VXORPD 
Z13, Z15, Z13 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z14 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z9, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (CX), Z14 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z10, Z14, Z15 + VXORPD Z12, Z15, Z12 + VGF2P8AFFINEQB $0x00, Z11, Z14, Z15 + VXORPD Z13, Z15, Z13 + + // Store 2 outputs + VMOVDQU64 Z12, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z13, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x2_64Xor_loop + VZEROUPPER + +mulGFNI_6x2_64Xor_end: + RET + +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2Xor_loop + VZEROUPPER + +mulAvxGFNI_6x2Xor_end: + RET + +// func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64(SB), $0-88 + 
// Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64_loop + VZEROUPPER + +mulGFNI_6x3_64_end: + RET + +// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + 
VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + +// func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + 
VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, CX + +mulGFNI_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R10), Z18 + VMOVDQU64 (R11), Z19 + VMOVDQU64 (R9), Z20 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z21 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z2, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z21 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z4, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z5, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z21 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z8, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z21 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z10, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z11, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z21 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z14, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (CX), Z21 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z15, Z21, Z22 + VXORPD Z18, Z22, Z18 + VGF2P8AFFINEQB $0x00, Z16, Z21, Z22 + VXORPD Z19, Z22, Z19 + VGF2P8AFFINEQB $0x00, Z17, Z21, Z22 + VXORPD Z20, Z22, Z20 + + // Store 3 outputs + VMOVDQU64 Z18, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z19, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z20, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x3_64Xor_loop + VZEROUPPER + +mulGFNI_6x3_64Xor_end: + RET + +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + 
MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3Xor_loop + VZEROUPPER + +mulAvxGFNI_6x3Xor_end: + RET + +// func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + 
MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64_loop + VZEROUPPER + +mulGFNI_6x4_64_end: + RET + +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), 
R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + +// func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), 
Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, CX + +mulGFNI_6x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R10), Z24 + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R9), Z27 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z28 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z28 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z28 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z28 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z28 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z17, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z18, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z19, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (CX), Z28 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z20, Z28, Z29 + VXORPD Z24, Z29, Z24 + VGF2P8AFFINEQB $0x00, Z21, Z28, Z29 + VXORPD Z25, Z29, Z25 + VGF2P8AFFINEQB $0x00, Z22, Z28, Z29 + VXORPD Z26, Z29, Z26 + VGF2P8AFFINEQB $0x00, Z23, Z28, Z29 + VXORPD Z27, Z29, Z27 + + // Store 4 outputs + VMOVDQU64 Z24, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x4_64Xor_loop + VZEROUPPER + +mulGFNI_6x4_64Xor_end: + RET + +// func 
mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + +// func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 
+ VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64_loop + VZEROUPPER + +mulGFNI_6x5_64_end: + RET + +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + +// func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 + // Loading 25 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to 
output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulGFNI_6x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R11), Z25 + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x5_64Xor_loop + VZEROUPPER + +mulGFNI_6x5_64Xor_end: + RET + +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + 
VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + +// func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64(SB), $8-88 + // Loading 24 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + 
VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64_loop + VZEROUPPER + +mulGFNI_6x6_64_end: + RET + +// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + 
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + +// func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 + // Loading 24 of 36 tables to registers + // 
Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulGFNI_6x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 
6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_6x6_64Xor_loop + VZEROUPPER + +mulGFNI_6x6_64Xor_end: + RET + +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + +// func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 
32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64_loop + VZEROUPPER + +mulGFNI_6x7_64_end: + RET + +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + +// func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 + // Loading 23 of 42 tables to registers + // Destination kept in GP registers + // Full registers 
estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_6x7_64Xor_loop: + // Load 7 outputs + VMOVDQU64 (R10), Z23 + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R9), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 
+ VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_6x7_64Xor_loop + VZEROUPPER + +mulGFNI_6x7_64Xor_end: + RET + +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + +// func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, 
Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64_loop + VZEROUPPER + +mulGFNI_6x8_64_end: + RET + +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs 
+ VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + +// func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 + // Loading 22 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x8_64Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, 
Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + 
MOVQ 72(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x8_64Xor_loop + VZEROUPPER + +mulGFNI_6x8_64Xor_end: + RET + +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + +// func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used 
+ MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64_loop + VZEROUPPER + +mulGFNI_6x9_64_end: + RET + +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU 
(SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 
+ ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + +// func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 + // Loading 21 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x9_64Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 
+ VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + 
VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x9_64Xor_loop + VZEROUPPER + +mulGFNI_6x9_64Xor_end: + RET + +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + +// func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + 
VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + 
+ // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64_loop + VZEROUPPER + +mulGFNI_6x10_64_end: + RET + +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 
bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + +// func 
mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 + // Loading 20 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_6x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulGFNI_6x10_64Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU64 (R12)(R11*1), Z20 + MOVQ 24(R10), R12 + VMOVDQU64 (R12)(R11*1), Z21 + MOVQ 48(R10), R12 + VMOVDQU64 (R12)(R11*1), Z22 + MOVQ 72(R10), R12 + VMOVDQU64 (R12)(R11*1), Z23 + MOVQ 96(R10), R12 + VMOVDQU64 (R12)(R11*1), Z24 + MOVQ 120(R10), R12 + VMOVDQU64 (R12)(R11*1), Z25 + MOVQ 144(R10), R12 + VMOVDQU64 (R12)(R11*1), Z26 + MOVQ 168(R10), R12 + VMOVDQU64 (R12)(R11*1), Z27 + MOVQ 192(R10), R12 + VMOVDQU64 (R12)(R11*1), Z28 + MOVQ 216(R10), R12 + VMOVDQU64 (R12)(R11*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), 
Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU64 Z20, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU64 Z21, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU64 Z22, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU64 Z23, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU64 Z24, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU64 Z25, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU64 Z26, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU64 Z27, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU64 Z28, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU64 Z29, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x40, R11 + DECQ AX + JNZ mulGFNI_6x10_64Xor_loop + VZEROUPPER + 
+mulGFNI_6x10_64Xor_end: + RET + +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 
+ VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 
10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + +// func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64_loop + VZEROUPPER + +mulGFNI_7x1_64_end: + RET + +// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 
120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + +// func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulGFNI_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R10), Z7 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z8 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z8 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z8 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z8 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z8 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z8 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z8, Z8 + VXORPD Z7, Z8, Z7 + 
+ // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (CX), Z8 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z6, Z8, Z8 + VXORPD Z7, Z8, Z7 + + // Store 1 outputs + VMOVDQU64 Z7, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x1_64Xor_loop + VZEROUPPER + +mulGFNI_7x1_64Xor_end: + RET + +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1Xor_loop + VZEROUPPER + +mulAvxGFNI_7x1Xor_end: + RET + +// func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), 
CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64_loop + VZEROUPPER + +mulGFNI_7x2_64_end: + RET + +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + +// func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, CX + +mulGFNI_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R11), Z14 + VMOVDQU64 (R10), Z15 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z16 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z1, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z16 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z3, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z16 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z16 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, 
Z6, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z7, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z16 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z9, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z16 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z11, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (CX), Z16 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z12, Z16, Z17 + VXORPD Z14, Z17, Z14 + VGF2P8AFFINEQB $0x00, Z13, Z16, Z17 + VXORPD Z15, Z17, Z15 + + // Store 2 outputs + VMOVDQU64 Z14, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x2_64Xor_loop + VZEROUPPER + +mulGFNI_7x2_64Xor_end: + RET + +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 
32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2Xor_loop + VZEROUPPER + +mulAvxGFNI_7x2Xor_end: + RET + +// func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + 
VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64_loop + VZEROUPPER + +mulGFNI_7x3_64_end: + RET + +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + +// func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, CX + +mulGFNI_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R11), Z21 + VMOVDQU64 (R12), Z22 + VMOVDQU64 (R10), Z23 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z24 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z24 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z4, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z5, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z24 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z7, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z8, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z24 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z24 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z24, Z25 + VXORPD Z21, Z25, Z21 + 
VGF2P8AFFINEQB $0x00, Z13, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z14, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z24 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z16, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (CX), Z24 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z24, Z25 + VXORPD Z21, Z25, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z24, Z25 + VXORPD Z22, Z25, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z24, Z25 + VXORPD Z23, Z25, Z23 + + // Store 3 outputs + VMOVDQU64 Z21, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z22, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z23, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x3_64Xor_loop + VZEROUPPER + +mulGFNI_7x3_64Xor_end: + RET + +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3Xor_loop + VZEROUPPER + +mulAvxGFNI_7x3Xor_end: + RET + +// func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 
(DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64_loop + VZEROUPPER + +mulGFNI_7x4_64_end: + RET + +// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + +// func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 + // Loading 26 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 
128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulGFNI_7x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R12), Z26 + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R11) + 
ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x4_64Xor_loop + VZEROUPPER + +mulGFNI_7x4_64Xor_end: + RET + +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + +// func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + 
VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64_loop + VZEROUPPER + +mulGFNI_7x5_64_end: + RET + +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX 
+ +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, 
R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + +// func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 + // Loading 25 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulGFNI_7x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + 
VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_7x5_64Xor_loop + VZEROUPPER + +mulGFNI_7x5_64Xor_end: + RET + +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + 
VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + +// func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64(SB), $8-88 + // 
Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 
200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, 
(R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB 
$0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64Xor_loop + VZEROUPPER + +mulGFNI_7x6_64Xor_end: + RET + +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + 
ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 
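+	// (Only 8 of the 42 coefficient tables fit in YMM registers in this kernel;
+	// the remaining coefficients are re-broadcast from the matrix at CX for each
+	// 32-byte block: VBROADCASTSD loads the 8x8 bit matrix for one GF(2^8)
+	// constant, VGF2P8AFFINEQB multiplies every input byte by that constant,
+	// and VXORPD folds the product into the corresponding output register.)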
+ VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6Xor_loop + VZEROUPPER + +mulAvxGFNI_7x6Xor_end: + RET + +// func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + 
VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64_loop + VZEROUPPER + +mulGFNI_7x7_64_end: + RET + +// func mulAvxGFNI_7x7(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + +// func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 + // Loading 23 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + 
MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x7_64Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ 
$0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x7_64Xor_loop + VZEROUPPER + +mulGFNI_7x7_64Xor_end: + RET + +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + +// func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + 
ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + 
VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64_loop + VZEROUPPER + +mulGFNI_7x8_64_end: + RET + +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + +// func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 + // Loading 22 of 56 tables to registers + // Destination kept on stack + // Full registers 
estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x8_64Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + 
VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x8_64Xor_loop + VZEROUPPER + +mulGFNI_7x8_64Xor_end: + RET + +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ 
in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + +// func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ 
AX, AX + JZ mulGFNI_7x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, 
Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64_loop + VZEROUPPER + +mulGFNI_7x9_64_end: + RET + +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ 
R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + +// func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 + // Loading 21 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 
40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x9_64Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, 
Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x9_64Xor_loop + VZEROUPPER + +mulGFNI_7x9_64Xor_end: + RET + +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM 
used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + +// func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, 
Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64_loop + VZEROUPPER + +mulGFNI_7x10_64_end: + RET + +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + +// func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 + // Loading 20 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulGFNI_7x10_64Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU64 (R13)(R12*1), Z20 + MOVQ 24(R11), R13 + VMOVDQU64 (R13)(R12*1), Z21 + MOVQ 48(R11), R13 + VMOVDQU64 (R13)(R12*1), Z22 + MOVQ 72(R11), R13 + VMOVDQU64 (R13)(R12*1), Z23 + MOVQ 96(R11), R13 + VMOVDQU64 (R13)(R12*1), Z24 + MOVQ 120(R11), R13 + VMOVDQU64 (R13)(R12*1), Z25 + MOVQ 144(R11), R13 + VMOVDQU64 (R13)(R12*1), Z26 + MOVQ 168(R11), R13 + VMOVDQU64 (R13)(R12*1), Z27 + MOVQ 192(R11), R13 + VMOVDQU64 (R13)(R12*1), Z28 + MOVQ 216(R11), R13 + VMOVDQU64 (R13)(R12*1), Z29 + + // Load and process 64 bytes from input 0 to 
10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU64 Z20, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU64 Z21, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU64 Z22, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU64 Z23, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU64 Z24, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU64 Z25, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU64 Z26, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU64 Z27, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU64 Z28, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU64 Z29, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x40, R12 + DECQ AX + JNZ mulGFNI_7x10_64Xor_loop + VZEROUPPER + +mulGFNI_7x10_64Xor_end: + RET + +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + 
MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, 
(R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + +// func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), 
Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 
outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64Xor_loop + VZEROUPPER + +mulGFNI_8x1_64Xor_end: + RET + +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1Xor_loop + VZEROUPPER + +mulAvxGFNI_8x1Xor_end: + RET + +// func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64(SB), $0-88 + // 
Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64_loop + VZEROUPPER + +mulGFNI_8x2_64_end: + RET + +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + RET + +// func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 
88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, CX + +mulGFNI_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R12), Z16 + VMOVDQU64 (R11), Z17 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z18 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z1, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z18 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z3, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z18 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z5, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z18 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z18 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z9, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z18 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z11, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z18 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z13, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (CX), Z18 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z14, Z18, Z19 + VXORPD Z16, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z15, Z18, Z19 + VXORPD Z17, Z19, Z17 + + // Store 2 outputs + VMOVDQU64 Z16, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z17, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x2_64Xor_loop + VZEROUPPER + +mulGFNI_8x2_64Xor_end: + RET + +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 
+ MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2Xor_loop + VZEROUPPER + +mulAvxGFNI_8x2Xor_end: + RET + +// func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + 
VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64_loop + VZEROUPPER + +mulGFNI_8x3_64_end: + RET + +// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + +// func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, CX + +mulGFNI_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R12), Z24 + VMOVDQU64 (R13), Z25 + VMOVDQU64 (R11), Z26 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z27 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z27 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z27 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z27 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z10, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z27 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z27 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z27 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z27, Z28 + VXORPD Z24, Z28, Z24 + 
VGF2P8AFFINEQB $0x00, Z19, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (CX), Z27 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z21, Z27, Z28 + VXORPD Z24, Z28, Z24 + VGF2P8AFFINEQB $0x00, Z22, Z27, Z28 + VXORPD Z25, Z28, Z25 + VGF2P8AFFINEQB $0x00, Z23, Z27, Z28 + VXORPD Z26, Z28, Z26 + + // Store 3 outputs + VMOVDQU64 Z24, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z25, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z26, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x3_64Xor_loop + VZEROUPPER + +mulGFNI_8x3_64Xor_end: + RET + +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), 
Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3Xor_loop + VZEROUPPER + +mulAvxGFNI_8x3Xor_end: + RET + +// func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64_loop + VZEROUPPER + +mulGFNI_8x4_64_end: + RET + +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + 
ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + +// func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 + // Loading 26 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulGFNI_8x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD 
Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x4_64Xor_loop + VZEROUPPER + +mulGFNI_8x4_64Xor_end: + RET + +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + +// func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + 
VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, 
Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64_loop + VZEROUPPER + +mulGFNI_8x5_64_end: + RET + +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + +// func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 + // Loading 25 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + 
MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_8x5_64Xor_loop: + // Load 5 outputs + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R11), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, 
Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_8x5_64Xor_loop + VZEROUPPER + +mulGFNI_8x5_64Xor_end: + RET + +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 
+ VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + +// func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64_loop + VZEROUPPER + +mulGFNI_8x6_64_end: + RET + +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ 
out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + +// func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 + // Loading 24 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x6_64Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 96(R12), R14 + VMOVDQU64 
(R14)(R13*1), Z28 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (DX), Z30 
+ ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x6_64Xor_loop + VZEROUPPER + +mulGFNI_8x6_64Xor_end: + RET + +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 
48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + +// func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), 
Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64_loop + VZEROUPPER + +mulGFNI_8x7_64_end: + RET + +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 
24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + +// func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 + // Loading 23 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), 
Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x7_64Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, 
Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x7_64Xor_loop + VZEROUPPER + +mulGFNI_8x7_64Xor_end: + RET + +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // 
Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + +// func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + 
VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64_loop + VZEROUPPER + +mulGFNI_8x8_64_end: + RET + +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load 
and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + +// func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 + // Loading 22 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + 
VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x8_64Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), 
Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x8_64Xor_loop + VZEROUPPER + +mulGFNI_8x8_64Xor_end: + RET + +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, 
R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 
+ VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + +// func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD 
Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + 
ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64_loop + VZEROUPPER + +mulGFNI_8x9_64_end: + RET + +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + +// func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 + // Loading 21 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + 
MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x9_64Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ 
$0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x9_64Xor_loop + VZEROUPPER + +mulGFNI_8x9_64Xor_end: + RET + +// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + +// func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB 
$0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 
+ VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64_loop + VZEROUPPER + +mulGFNI_8x10_64_end: + RET + +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 
bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes 
from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + +// func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 + // Loading 20 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulGFNI_8x10_64Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU64 (R14)(R13*1), Z20 + MOVQ 24(R12), R14 + VMOVDQU64 (R14)(R13*1), Z21 + MOVQ 48(R12), R14 + VMOVDQU64 (R14)(R13*1), Z22 + MOVQ 72(R12), R14 + VMOVDQU64 (R14)(R13*1), Z23 + MOVQ 96(R12), R14 + VMOVDQU64 (R14)(R13*1), Z24 + MOVQ 120(R12), R14 + VMOVDQU64 (R14)(R13*1), Z25 + MOVQ 144(R12), R14 + VMOVDQU64 (R14)(R13*1), Z26 + MOVQ 168(R12), R14 + VMOVDQU64 (R14)(R13*1), Z27 + MOVQ 192(R12), R14 + VMOVDQU64 (R14)(R13*1), Z28 + MOVQ 216(R12), R14 + VMOVDQU64 (R14)(R13*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + 
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + 
VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU64 Z20, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU64 Z21, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU64 Z22, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU64 Z23, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU64 Z24, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU64 Z25, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU64 Z26, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU64 Z27, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU64 Z28, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU64 Z29, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x40, R13 + DECQ AX + JNZ mulGFNI_8x10_64Xor_loop + VZEROUPPER + +mulGFNI_8x10_64Xor_end: + RET + +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + 
ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + +// func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and 
process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64_loop + VZEROUPPER + +mulGFNI_9x1_64_end: + RET + +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ 
mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + +// func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulGFNI_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R12), Z9 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z10 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z10 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z10 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z10 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z10 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z10 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z10 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z10 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (CX), Z10 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z8, Z10, Z10 + VXORPD Z9, Z10, Z9 + + // Store 1 outputs + VMOVDQU64 Z9, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x1_64Xor_loop + VZEROUPPER + +mulGFNI_9x1_64Xor_end: + RET + +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 
48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1Xor_loop + VZEROUPPER + +mulAvxGFNI_9x1Xor_end: + RET + +// func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + 
+mulGFNI_9x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64_loop + VZEROUPPER + +mulGFNI_9x2_64_end: + RET + +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + 
VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + +// func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + 
ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, CX + +mulGFNI_9x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R13), Z18 + VMOVDQU64 (R12), Z19 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z20 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z20 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z20 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z20 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z20 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z20 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z11, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z20 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z13, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z20 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z15, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z16, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z17, Z20, Z21 + VXORPD Z19, Z21, Z19 + + // Store 2 outputs + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x2_64Xor_loop + VZEROUPPER + +mulGFNI_9x2_64Xor_end: + RET + +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 
+ ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2Xor_loop + VZEROUPPER + +mulAvxGFNI_9x2Xor_end: + RET + +// func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + 
VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + 
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64Xor_loop + VZEROUPPER + +mulGFNI_9x3_64Xor_end: + RET + +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + 
ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3Xor_loop + VZEROUPPER + +mulAvxGFNI_9x3Xor_end: + RET + +// func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), 
Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 
(R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64_loop + VZEROUPPER + +mulGFNI_9x4_64_end: + RET + +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + +// func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 + // Loading 26 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + 
VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_9x4_64Xor_loop: + // Load 4 outputs + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R12), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 
216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_9x4_64Xor_loop + VZEROUPPER + +mulGFNI_9x4_64Xor_end: + RET + +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + +// func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 
24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 
+ VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64_loop + VZEROUPPER + +mulGFNI_9x5_64_end: + RET + +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + +// func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 + // Loading 25 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x5_64Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load 
and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x5_64Xor_loop + VZEROUPPER + +mulGFNI_9x5_64Xor_end: + RET + +// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + 
VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + +// func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ 
out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 
6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64_loop + VZEROUPPER + +mulGFNI_9x6_64_end: + RET + +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 
104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + +// func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 + // Loading 24 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x6_64Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, 
Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 
+ VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x6_64Xor_loop + VZEROUPPER + +mulGFNI_9x6_64Xor_end: + RET + +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + +// func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x7_64(SB), $0-88 + // Loading 23 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x7_64_loop + VZEROUPPER + +mulGFNI_9x7_64_end: + RET + +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + 
	[... remainder of the generated mulAvxGFNI_9x7 loop body omitted: inputs 3 through 8 are multiplied by their broadcast 8x8 bit matrices with VGF2P8AFFINEQB, accumulated into Y7-Y13 with VXORPD, and the 7 outputs are stored through the out_base pointer table ...]
+
+	// Prepare for next loop
+	ADDQ	$0x20, R14
+	DECQ	AX
+	JNZ	mulAvxGFNI_9x7_loop
+	VZEROUPPER
+
+mulAvxGFNI_9x7_end:
+	RET
+
+// func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x7_64Xor(SB), $0-88
+	// Loading 23 of 63 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 72 YMM used
	[... generated body omitted: 64-byte ZMM Xor loop that reloads the 7 existing outputs, accumulates VGF2P8AFFINEQB products from all 9 inputs, and stores the results ...]
+
+mulGFNI_9x7_64Xor_end:
+	RET
+
+// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88
+	// Loading 7 of 63 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 72 YMM used
	[... generated body omitted: 32-byte YMM Xor loop over 9 inputs and 7 outputs ...]
+
+mulAvxGFNI_9x7Xor_end:
+	RET
+
+// func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64(SB), $0-88
+	// Loading 22 of 72 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 82 YMM used
	[... generated body omitted: 64-byte ZMM loop over 9 inputs and 8 outputs ...]
+
+mulGFNI_9x8_64_end:
+	RET
+
+// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8(SB), $0-88
+	// Loading 6 of 72 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 82 YMM used
	[... generated body omitted: 32-byte YMM loop over 9 inputs and 8 outputs ...]
+
+mulAvxGFNI_9x8_end:
+	RET
+
+// func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x8_64Xor(SB), $0-88
+	// Loading 22 of 72 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 82 YMM used
	[... generated body omitted: 64-byte ZMM Xor loop over 9 inputs and 8 outputs ...]
+
+mulGFNI_9x8_64Xor_end:
+	RET
+
+// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88
+	// Loading 6 of 72 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 82 YMM used
	[... generated body omitted: 32-byte YMM Xor loop over 9 inputs and 8 outputs ...]
+
+mulAvxGFNI_9x8Xor_end:
+	RET
+
+// func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64(SB), $0-88
+	// Loading 21 of 81 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 92 YMM used
	[... generated body omitted: 64-byte ZMM loop over 9 inputs and 9 outputs ...]
+
+mulGFNI_9x9_64_end:
+	RET
+
+// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, GFNI
+TEXT ·mulAvxGFNI_9x9(SB), $0-88
+	// Loading 5 of 81 tables to registers
+	// Destination kept on stack
+	// Full registers estimated 92 YMM used
	[... generated body omitted: 32-byte YMM loop over 9 inputs and 9 outputs ...]
+
+mulAvxGFNI_9x9_end:
+	RET
+
+// func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int)
+// Requires: AVX, AVX512DQ, AVX512F, GFNI
+TEXT ·mulGFNI_9x9_64Xor(SB), $0-88
+	// Loading 21 of 81 tables to registers
+	//
Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x9_64Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 
192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x9_64Xor_loop + VZEROUPPER + +mulGFNI_9x9_64Xor_end: + RET + +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + +// func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 
160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 
520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64_loop + VZEROUPPER + +mulGFNI_9x10_64_end: + RET + +// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from 
input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 
to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + +// func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 + // Loading 20 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulGFNI_9x10_64Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU64 (R15)(R14*1), Z20 + MOVQ 24(R13), R15 + VMOVDQU64 (R15)(R14*1), Z21 + MOVQ 48(R13), R15 + VMOVDQU64 (R15)(R14*1), Z22 + MOVQ 
72(R13), R15 + VMOVDQU64 (R15)(R14*1), Z23 + MOVQ 96(R13), R15 + VMOVDQU64 (R15)(R14*1), Z24 + MOVQ 120(R13), R15 + VMOVDQU64 (R15)(R14*1), Z25 + MOVQ 144(R13), R15 + VMOVDQU64 (R15)(R14*1), Z26 + MOVQ 168(R13), R15 + VMOVDQU64 (R15)(R14*1), Z27 + MOVQ 192(R13), R15 + VMOVDQU64 (R15)(R14*1), Z28 + MOVQ 216(R13), R15 + VMOVDQU64 (R15)(R14*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 
328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 
688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU64 Z20, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU64 Z21, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU64 Z22, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU64 Z23, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU64 Z24, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU64 Z25, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU64 Z26, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU64 Z27, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU64 Z28, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU64 Z29, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x40, R14 + DECQ AX + JNZ mulGFNI_9x10_64Xor_loop + VZEROUPPER + +mulGFNI_9x10_64Xor_end: + RET + +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, 
Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + +// func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + 
VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64_loop + VZEROUPPER + +mulGFNI_10x1_64_end: + RET + +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + +// func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 + // Loading all tables to registers + // 
Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulGFNI_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R13), Z10 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z11 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z11 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z11 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z11 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z11 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z11 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z11 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (R11), Z11 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z7, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU64 (R12), Z11 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z8, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Load and process 64 bytes from input 9 to 1 outputs + VMOVDQU64 (CX), Z11 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z9, Z11, Z11 + VXORPD Z10, Z11, Z10 + + // Store 1 outputs + VMOVDQU64 Z10, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x1_64Xor_loop + VZEROUPPER + +mulGFNI_10x1_64Xor_end: + RET + +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + 
MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1Xor_loop + VZEROUPPER + +mulAvxGFNI_10x1Xor_end: + RET + +// func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ 
(R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64_loop: + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64_loop + VZEROUPPER + +mulGFNI_10x2_64_end: + RET + +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 
+ MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + +// func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x2_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 
+ VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, CX + +mulGFNI_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU64 (R14), Z20 + VMOVDQU64 (R13), Z21 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU64 (DX), Z22 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU64 (BX), Z22 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z2, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z3, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU64 (SI), Z22 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z5, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU64 (DI), Z22 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z6, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z7, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU64 (R8), Z22 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z8, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z9, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU64 (R9), Z22 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z10, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU64 (R10), Z22 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z12, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z13, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU64 (R11), Z22 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z14, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z15, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU64 (R12), Z22 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z16, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z17, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU64 (CX), Z22 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z18, Z22, Z23 + VXORPD Z20, Z23, Z20 + VGF2P8AFFINEQB $0x00, Z19, Z22, Z23 + VXORPD Z21, Z23, Z21 + + // Store 2 outputs + VMOVDQU64 Z20, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z21, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_10x2_64Xor_loop + VZEROUPPER + +mulGFNI_10x2_64Xor_end: + RET + +// func mulAvxGFNI_10x2Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + 
VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2Xor_loop + VZEROUPPER + +mulAvxGFNI_10x2Xor_end: + RET + +// func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD 
Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 
+ VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 
16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs 
+ VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64Xor_loop + VZEROUPPER + +mulGFNI_10x3_64Xor_end: + RET + +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3Xor_loop + VZEROUPPER + +mulAvxGFNI_10x3Xor_end: + RET + +// func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 
160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64_loop: + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64_loop + VZEROUPPER + +mulGFNI_10x4_64_end: + RET + +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), 
Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + +// func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 + // Loading 26 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x4_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + 
VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x4_64Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 4 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 4 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 4 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 4 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 4 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 4 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 4 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, 
Z25, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 4 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 4 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 4 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x4_64Xor_loop + VZEROUPPER + +mulGFNI_10x4_64Xor_end: + RET + +// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + +// func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64_loop: + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB 
$0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64_loop + VZEROUPPER + +mulGFNI_10x5_64_end: + RET + +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + 
VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + 
VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + +// func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 + // Loading 25 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x5_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 
176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x5_64Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 5 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 5 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 5 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 5 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 5 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 5 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 5 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 5 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 5 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 5 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x5_64Xor_loop + VZEROUPPER + +mulGFNI_10x5_64Xor_end: + RET + +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + +// func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + 
VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + 
VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64_loop + VZEROUPPER + +mulGFNI_10x6_64_end: + RET + +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + +// func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 + // Loading 24 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x6_64Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 48(R14), BP + VMOVDQU64 
(BP)(R15*1), Z26 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD 
Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 6 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 6 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x6_64Xor_loop + VZEROUPPER + +mulGFNI_10x6_64Xor_end: + RET + +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + +// func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + 
VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64_loop: + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), 
Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64_loop + VZEROUPPER + +mulGFNI_10x7_64_end: + RET + +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + 
TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, 
Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + +// func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 + // Loading 23 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x7_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x7_64Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 7 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 7 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z24, Z31, Z24 + 
VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 7 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 7 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 7 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 7 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 7 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 7 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + 
VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 7 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 7 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x7_64Xor_loop + VZEROUPPER + +mulGFNI_10x7_64Xor_end: + RET + +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 
312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + 
+mulAvxGFNI_10x7Xor_end: + RET + +// func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64_loop: + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + 
VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64_loop + VZEROUPPER + +mulGFNI_10x8_64_end: + RET + +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), 
Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + +// func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 + // Loading 22 of 80 tables to registers + // Destination kept on stack + // Full 
registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x8_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x8_64Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 8 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 8 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 8 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 8 outputs + VMOVDQU64 (R8), 
Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 8 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 8 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 8 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 8 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 8 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST 
$0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 8 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x8_64Xor_loop + VZEROUPPER + +mulGFNI_10x8_64Xor_end: + RET + +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + +// func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64_loop: + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, 
Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST 
$0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64_loop + VZEROUPPER + +mulGFNI_10x9_64_end: + RET + +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 
to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + +// func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 + // Loading 21 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x9_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x9_64Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 
24(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 9 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 9 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 9 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 9 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 9 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 9 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 9 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 9 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 9 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 9 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + 
VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x9_64Xor_loop + VZEROUPPER + +mulGFNI_10x9_64Xor_end: + RET + +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + +// func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64_loop: + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, 
Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 
488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, 
R15 + DECQ AX + JNZ mulGFNI_10x10_64_loop + VZEROUPPER + +mulGFNI_10x10_64_end: + RET + +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + 
VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + +// func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 + // Loading 20 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulGFNI_10x10_64Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU64 (BP)(R15*1), Z20 + MOVQ 24(R14), BP + VMOVDQU64 (BP)(R15*1), Z21 + MOVQ 48(R14), BP + VMOVDQU64 (BP)(R15*1), Z22 + MOVQ 72(R14), BP + VMOVDQU64 (BP)(R15*1), Z23 + MOVQ 96(R14), BP + VMOVDQU64 (BP)(R15*1), Z24 + MOVQ 120(R14), BP + VMOVDQU64 (BP)(R15*1), Z25 + MOVQ 144(R14), BP + VMOVDQU64 (BP)(R15*1), Z26 + MOVQ 168(R14), BP + VMOVDQU64 (BP)(R15*1), Z27 + MOVQ 192(R14), BP + VMOVDQU64 (BP)(R15*1), Z28 + MOVQ 216(R14), BP + VMOVDQU64 (BP)(R15*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + 
VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 10 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 336(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 344(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 352(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 360(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 368(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 376(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 384(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 392(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 10 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB.BCST $0x00, 400(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 408(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 416(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 424(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 432(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 440(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 448(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 456(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 464(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 472(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 10 
outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB.BCST $0x00, 480(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 488(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 496(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 504(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 512(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 520(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 528(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 536(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 544(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 552(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 10 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB.BCST $0x00, 560(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 568(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 576(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 584(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 592(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 600(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 608(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 616(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 624(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 632(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 10 outputs + VMOVDQU64 (R13), Z30 + ADDQ $0x40, R13 + VGF2P8AFFINEQB.BCST $0x00, 640(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 648(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 656(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 664(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 672(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 680(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 688(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 696(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 704(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 712(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 720(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 728(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 736(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 744(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 752(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 760(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 768(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 776(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 784(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 792(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU64 Z20, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU64 Z21, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU64 Z22, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU64 Z23, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU64 Z24, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU64 Z25, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU64 Z26, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU64 Z27, 
(BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU64 Z28, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU64 Z29, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x40, R15 + DECQ AX + JNZ mulGFNI_10x10_64Xor_loop + VZEROUPPER + +mulGFNI_10x10_64Xor_end: + RET + +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + 
VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + +// func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z5, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VPTERNLOGD $0x96, Z7, Z3, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_0(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + VBROADCASTF32X2 t02+48(FP), Z2 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z3 + VMOVDQU64 (DI), Z4 + VMOVDQU64 (R8), Z5 + VMOVDQU64 (AX), Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z5, Z7 + VXORPD Z3, Z7, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z2, Z6, Z7 + VXORPD Z4, Z7, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z4, Z6, Z6 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z7 + VXORPD Z3, Z7, Z3 + VXORPD Z4, Z3, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z6, Z7 + VXORPD Z5, Z7, Z5 + VXORPD Z5, Z6, Z6 + VMOVDQU64 Z3, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z4, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z5, (R8) + ADDQ $0x40, R8 + 
VMOVDQU64 Z6, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_1(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_1(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_2(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_2(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + 
MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z5, Z6 + VXORPD Z4, Z6, Z4 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_3(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_3(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z3, Z5, Z3 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t23+40(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + VXORPD Z3, Z2, Z3 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z4, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VPTERNLOGD $0x96, Z6, Z2, Z4 + VXORPD Z3, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_4(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, 
AVX512F, GFNI +TEXT ·fftDIT48_gfni_4(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + VBROADCASTF32X2 t02+48(FP), Z1 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z2 + VMOVDQU64 (DI), Z3 + VMOVDQU64 (R8), Z4 + VMOVDQU64 (AX), Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z4, Z6 + VXORPD Z2, Z6, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z1, Z5, Z6 + VXORPD Z3, Z6, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z3, Z5, Z5 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z6 + VXORPD Z2, Z6, Z2 + VXORPD Z3, Z2, Z3 + VXORPD Z4, Z5, Z5 + VMOVDQU64 Z2, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z3, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z4, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z5, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t23+40(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VPTERNLOGD $0x96, Z5, Z1, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_5(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_5(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·ifftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t01+32(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + VXORPD Z2, Z1, Z2 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z2, Z5 + VXORPD Z1, Z5, Z1 + VXORPD Z3, Z4, Z4 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func 
fftDIT48_gfni_6(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·fftDIT48_gfni_6(SB), NOSPLIT, $0-56 + VBROADCASTF32X2 t02+48(FP), Z0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z1 + VMOVDQU64 (DI), Z2 + VMOVDQU64 (R8), Z3 + VMOVDQU64 (AX), Z4 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z3, Z5 + VXORPD Z1, Z5, Z1 + + // LEO_MULADD_512 + VGF2P8AFFINEQB $0x00, Z0, Z4, Z5 + VXORPD Z2, Z5, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z2, Z4, Z4 + VXORPD Z2, Z1, Z2 + VXORPD Z3, Z4, Z4 + VMOVDQU64 Z1, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z2, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z3, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z4, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func ifftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·ifftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET + +// func fftDIT48_gfni_7(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) +// Requires: AVX, AVX512DQ, AVX512F +TEXT ·fftDIT48_gfni_7(SB), NOSPLIT, $0-56 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), CX + MOVQ 8(CX), DX + XORQ BX, BX + MOVQ (CX)(BX*1), SI + ADDQ AX, BX + MOVQ (CX)(BX*1), DI + ADDQ AX, BX + MOVQ (CX)(BX*1), R8 + ADDQ AX, BX + MOVQ (CX)(BX*1), AX + +loop: + VMOVDQU64 (SI), Z0 + VMOVDQU64 (DI), Z1 + VMOVDQU64 (R8), Z2 + VMOVDQU64 (AX), Z3 + VXORPD Z0, Z2, Z2 + VXORPD Z1, Z3, Z3 + VXORPD Z1, Z0, Z1 + VXORPD Z2, Z3, Z3 + VMOVDQU64 Z0, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z1, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z2, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z3, (AX) + ADDQ $0x40, AX + SUBQ $0x40, DX + JA loop + VZEROUPPER + RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go new file mode 100644 index 000000000..429e2c20d --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go @@ -0,0 +1,2045 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. 
+ +//go:build !appengine && !noasm && gc && !nogen && !nopshufb +// +build !appengine,!noasm,gc,!nogen,!nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + avx2CodeGen = true + maxAvx2Inputs = 10 + maxAvx2Outputs = 10 + minAvx2Size = 64 +) + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { + n := stop - start + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxTwo_1x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_1x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_1x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_1x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_1x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_1x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_1x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_1x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_1x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_1x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 2: + switch len(out) { + case 1: + mulAvxTwo_2x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_2x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_2x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_2x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_2x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_2x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_2x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_2x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_2x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_2x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 3: + switch len(out) { + case 1: + mulAvxTwo_3x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_3x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_3x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_3x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_3x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_3x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_3x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_3x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_3x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_3x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 4: + switch len(out) { + case 1: + mulAvxTwo_4x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_4x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_4x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_4x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_4x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_4x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_4x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_4x8(matrix, in, out, 
start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_4x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_4x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 5: + switch len(out) { + case 1: + mulAvxTwo_5x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_5x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_5x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_5x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_5x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_5x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_5x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_5x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_5x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_5x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 6: + switch len(out) { + case 1: + mulAvxTwo_6x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_6x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_6x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_6x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_6x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_6x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_6x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_6x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_6x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_6x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 7: + switch len(out) { + case 1: + mulAvxTwo_7x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_7x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_7x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_7x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_7x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_7x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_7x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_7x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_7x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_7x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 8: + switch len(out) { + case 1: + mulAvxTwo_8x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_8x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_8x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_8x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_8x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_8x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_8x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_8x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_8x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_8x10(matrix, in, out, start, n) + 
return n & (maxInt - 31) + } + case 9: + switch len(out) { + case 1: + mulAvxTwo_9x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_9x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_9x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_9x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_9x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_9x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_9x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_9x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_9x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_9x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 10: + switch len(out) { + case 1: + mulAvxTwo_10x1_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_10x2_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_10x3_64(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_10x4(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_10x5(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_10x6(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_10x7(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_10x8(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_10x9(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_10x10(matrix, in, out, start, n) + return n & (maxInt - 31) + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_1x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_1x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_1x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_1x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_1x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_1x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_1x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 2: + switch len(out) { + case 1: + mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_2x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_2x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_2x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_2x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_2x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_2x9Xor(matrix, in, out, 
start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_2x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 3: + switch len(out) { + case 1: + mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_3x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_3x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_3x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_3x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_3x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_3x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_3x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 4: + switch len(out) { + case 1: + mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_4x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_4x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_4x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_4x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_4x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_4x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_4x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 5: + switch len(out) { + case 1: + mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_5x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_5x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_5x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_5x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_5x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_5x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_5x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 6: + switch len(out) { + case 1: + mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_6x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_6x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_6x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_6x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_6x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_6x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + 
mulAvxTwo_6x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 7: + switch len(out) { + case 1: + mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_7x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_7x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_7x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_7x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_7x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_7x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_7x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 8: + switch len(out) { + case 1: + mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_8x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_8x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_8x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_8x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_8x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_8x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_8x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 9: + switch len(out) { + case 1: + mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_9x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_9x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_9x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_9x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_9x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_9x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_9x10Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + } + case 10: + switch len(out) { + case 1: + mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 2: + mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 3: + mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) + return n & (maxInt - 63) + case 4: + mulAvxTwo_10x4Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 5: + mulAvxTwo_10x5Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 6: + mulAvxTwo_10x6Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 7: + mulAvxTwo_10x7Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 8: + mulAvxTwo_10x8Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 9: + mulAvxTwo_10x9Xor(matrix, in, out, start, n) + return n & (maxInt - 31) + case 10: + mulAvxTwo_10x10Xor(matrix, in, out, start, 
n) + return n & (maxInt - 31) + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64(matrix, in, out, start, n) + return n + case 
9: + mulGFNI_5x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64(matrix, in, out, start, n) 
+ return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64Xor(matrix, in, out, start, n) + return n + case 7: + 
mulGFNI_5x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64Xor(matrix, in, out, 
start, n) + return n + case 7: + mulGFNI_10x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + 
return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, 
start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 
3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, 
start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go new file mode 100644 index 000000000..1ba08b5e1 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go @@ -0,0 +1,1372 @@ +// Code generated by command: go generate gen.go. DO NOT EDIT. + +//go:build !appengine && !noasm && gc && !nogen && nopshufb +// +build !appengine,!noasm,gc,!nogen,nopshufb + +package reedsolomon + +import ( + "fmt" +) + +const ( + avx2CodeGen = true + maxAvx2Inputs = 10 + maxAvx2Outputs = 10 + minAvx2Size = 64 +) + +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } + +func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_3x8_64(matrix, in, out, 
start, n) + return n + case 9: + mulGFNI_3x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_8x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64(matrix, 
in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (64 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulGFNI_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_1x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_1x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_1x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_1x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_1x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_1x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_1x10_64Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulGFNI_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_2x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_2x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_2x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_2x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_2x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_2x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_2x10_64Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulGFNI_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_3x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_3x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_3x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_3x7_64Xor(matrix, in, 
out, start, n) + return n + case 8: + mulGFNI_3x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_3x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_3x10_64Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulGFNI_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_4x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_4x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_4x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_4x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_4x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_4x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_4x10_64Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulGFNI_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_5x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_5x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_5x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_5x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_5x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_5x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_5x10_64Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulGFNI_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_6x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_6x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_6x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_6x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_6x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_6x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_6x10_64Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulGFNI_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_7x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_7x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_7x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_7x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_7x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_7x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_7x10_64Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulGFNI_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_8x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_8x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_8x6_64Xor(matrix, in, out, start, n) + return n + case 7: + 
mulGFNI_8x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_8x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_8x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_8x10_64Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulGFNI_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_9x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_9x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_9x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_9x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_9x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_9x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_9x10_64Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulGFNI_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulGFNI_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulGFNI_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulGFNI_10x4_64Xor(matrix, in, out, start, n) + return n + case 5: + mulGFNI_10x5_64Xor(matrix, in, out, start, n) + return n + case 6: + mulGFNI_10x6_64Xor(matrix, in, out, start, n) + return n + case 7: + mulGFNI_10x7_64Xor(matrix, in, out, start, n) + return n + case 8: + mulGFNI_10x8_64Xor(matrix, in, out, start, n) + return n + case 9: + mulGFNI_10x9_64Xor(matrix, in, out, start, n) + return n + case 10: + mulGFNI_10x10_64Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + 
mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 
6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + 
mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, 
n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go new file mode 100644 index 000000000..fb5a3b654 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go @@ -0,0 +1,91 @@ +//go:build (!amd64 || noasm || appengine || gccgo) && (!arm64 || noasm || appengine || gccgo || nopshufb) && (!ppc64le || noasm || appengine || gccgo || nopshufb) + +// Copyright 2015, Klaus Post, see LICENSE for details. 
+ +package reedsolomon + +const pshufb = false + +func galMulSlice(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + copy(out, in) + return + } + mt := mulTable[c][:256] + for n, input := range in { + out[n] = mt[input] + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + sliceXor(in, out, o) + return + } + mt := mulTable[c][:256] + for n, input := range in { + out[n] ^= mt[input] + } +} + +func init() { + defaultOptions.useAVX512 = false +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + sliceXorGo(x, y, o) +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + refMulAdd8(x, y, log_m) + sliceXorGo(x, y, o) +} + +// 2-way butterfly inverse +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + sliceXorGo(x, y, o) + refMulAdd(x, y, log_m) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + sliceXorGo(x, y, o) + refMulAdd8(x, y, log_m) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} + +func mulgf8(x, y []byte, log_m ffe8, o *options) { + refMul8(x, y, log_m) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go new file mode 100644 index 000000000..89c74e242 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_nopshufb_amd64.go @@ -0,0 +1,146 @@ +// Copyright 2015, Klaus Post, see LICENSE for details + +//go:build nopshufb && !noasm + +package reedsolomon + +// bigSwitchover is the size where 64 bytes are processed per loop. 
+const bigSwitchover = 128 + +const pshufb = false + +// simple slice xor +func sliceXor(in, out []byte, o *options) { + if o.useSSE2 { + if len(in) >= bigSwitchover { + if o.useAVX2 { + avx2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } else { + sSE2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + } + if len(in) >= 16 { + sSE2XorSlice(in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } + } else { + sliceXorGo(in, out, o) + return + } + out = out[:len(in)] + for i := range in { + out[i] ^= in[i] + } +} + +func galMulSlice(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + copy(out, in) + return + } + mt := mulTable[c][:256] + for len(in) >= 4 { + ii := (*[4]byte)(in) + oo := (*[4]byte)(out) + oo[0] = mt[ii[0]] + oo[1] = mt[ii[1]] + oo[2] = mt[ii[2]] + oo[3] = mt[ii[3]] + in = in[4:] + out = out[4:] + } + for n, input := range in { + out[n] = mt[input] + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + out = out[:len(in)] + if c == 1 { + sliceXor(in, out, o) + return + } + mt := mulTable[c][:256] + for len(in) >= 4 { + ii := (*[4]byte)(in) + oo := (*[4]byte)(out) + oo[0] ^= mt[ii[0]] + oo[1] ^= mt[ii[1]] + oo[2] ^= mt[ii[2]] + oo[3] ^= mt[ii[3]] + in = in[4:] + out = out[4:] + } + for n, input := range in { + out[n] ^= mt[input] + } +} + +func init() { + defaultOptions.useAVX512 = false +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + sliceXor(x, y, o) +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + refMulAdd8(x, y, log_m) + sliceXor(x, y, o) +} + +// 2-way butterfly inverse +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + sliceXor(x, y, o) + refMulAdd(x, y, log_m) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + sliceXor(x, y, o) + refMulAdd8(x, y, log_m) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} + +func mulgf8(x, y []byte, log_m ffe8, o *options) { + refMul8(x, y, log_m) +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go new file mode 100644 index 000000000..f98bfed11 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_notamd64.go @@ -0,0 +1,13 @@ +//go:build !amd64 || noasm || appengine || gccgo || pshufb + +// Copyright 2020, Klaus Post, see LICENSE for details. 
+ +package reedsolomon + +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { + panic("codeSomeShardsAvx512 should not be called if built without asm") +} + +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { + panic("codeSomeShardsAvx512P should not be called if built without asm") +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go new file mode 100644 index 000000000..c4c80351f --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go @@ -0,0 +1,146 @@ +//go:build !noasm && !appengine && !gccgo && !nopshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2018, Minio, Inc. + +package reedsolomon + +const pshufb = true + +//go:noescape +func galMulPpc(low, high, in, out []byte) + +//go:noescape +func galMulPpcXor(low, high, in, out []byte) + +// This is what the assembler routines do in blocks of 16 bytes: +/* +func galMulPpc(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] = low[l] ^ high[h] + } +} +func galMulPpcXor(low, high, in, out []byte) { + for n, input := range in { + l := input & 0xf + h := input >> 4 + out[n] ^= low[l] ^ high[h] + } +} +*/ + +func galMulSlice(c byte, in, out []byte, o *options) { + if c == 1 { + copy(out, in) + return + } + done := (len(in) >> 4) << 4 + if done > 0 { + galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) + } + remain := len(in) - done + if remain > 0 { + mt := mulTable[c][:256] + for i := done; i < len(in); i++ { + out[i] = mt[in[i]] + } + } +} + +func galMulSliceXor(c byte, in, out []byte, o *options) { + if c == 1 { + sliceXor(in, out, o) + return + } + done := (len(in) >> 4) << 4 + if done > 0 { + galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) + } + remain := len(in) - done + if remain > 0 { + mt := mulTable[c][:256] + for i := done; i < len(in); i++ { + out[i] ^= mt[in[i]] + } + } +} + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + sliceXorGo(x, y, o) +} + +// 2-way butterfly forward +func fftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + mulAdd8(x, y, log_m, o) + sliceXorGo(x, y, o) +} + +// 2-way butterfly inverse +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + sliceXorGo(x, y, o) + refMulAdd(x, y, log_m) +} + +// 2-way butterfly inverse +func ifftDIT28(x, y []byte, log_m ffe8, o *options) { + // Reference version: + sliceXorGo(x, y, o) + mulAdd8(x, y, log_m, o) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} + +func mulAdd8(out, in []byte, log_m ffe8, o *options) { + t := &multiply256LUT8[log_m] + 
galMulPpcXor(t[:16], t[16:32], in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + if len(in) > 0 { + out = out[done:] + refMulAdd8(in, out, log_m) + } +} + +func mulgf8(out, in []byte, log_m ffe8, o *options) { + var done int + t := &multiply256LUT8[log_m] + galMulPpc(t[:16], t[16:32], in, out) + done = (len(in) >> 4) << 4 + + remain := len(in) - done + if remain > 0 { + mt := mul8LUTs[log_m].Value[:] + for i := done; i < len(in); i++ { + out[i] ^= byte(mt[in[i]]) + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s new file mode 100644 index 000000000..c585c2b64 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s @@ -0,0 +1,127 @@ +//+build !noasm +//+build !appengine +//+build !gccgo +//+build !pshufb + +// Copyright 2015, Klaus Post, see LICENSE for details. +// Copyright 2018, Minio, Inc. + +#include "textflag.h" + +#define LOW R3 +#define HIGH R4 +#define IN R5 +#define LEN R6 +#define OUT R7 +#define CONSTANTS R8 +#define OFFSET R9 +#define OFFSET1 R10 +#define OFFSET2 R11 + +#define X6 VS34 +#define X6_ V2 +#define X7 VS35 +#define X7_ V3 +#define MSG VS36 +#define MSG_ V4 +#define MSG_HI VS37 +#define MSG_HI_ V5 +#define RESULT VS38 +#define RESULT_ V6 +#define ROTATE VS39 +#define ROTATE_ V7 +#define MASK VS40 +#define MASK_ V8 +#define FLIP VS41 +#define FLIP_ V9 + +// func galMulPpc(low, high, in, out []byte) +TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96 + MOVD low+0(FP), LOW + MOVD high+24(FP), HIGH + MOVD in+48(FP), IN + MOVD in_len+56(FP), LEN + MOVD out+72(FP), OUT + + MOVD $16, OFFSET1 + MOVD $32, OFFSET2 + + MOVD $·constants(SB), CONSTANTS + LXVD2X (CONSTANTS)(R0), ROTATE + LXVD2X (CONSTANTS)(OFFSET1), MASK + LXVD2X (CONSTANTS)(OFFSET2), FLIP + + LXVD2X (LOW)(R0), X6 + LXVD2X (HIGH)(R0), X7 + VPERM X6_, V31, FLIP_, X6_ + VPERM X7_, V31, FLIP_, X7_ + + MOVD $0, OFFSET + +loop: + LXVD2X (IN)(OFFSET), MSG + + VSRB MSG_, ROTATE_, MSG_HI_ + VAND MSG_, MASK_, MSG_ + VPERM X6_, V31, MSG_, MSG_ + VPERM X7_, V31, MSG_HI_, MSG_HI_ + + VXOR MSG_, MSG_HI_, MSG_ + + STXVD2X MSG, (OUT)(OFFSET) + + ADD $16, OFFSET, OFFSET + CMP LEN, OFFSET + BGT loop + RET + +// func galMulPpcXorlow, high, in, out []byte) +TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96 + MOVD low+0(FP), LOW + MOVD high+24(FP), HIGH + MOVD in+48(FP), IN + MOVD in_len+56(FP), LEN + MOVD out+72(FP), OUT + + MOVD $16, OFFSET1 + MOVD $32, OFFSET2 + + MOVD $·constants(SB), CONSTANTS + LXVD2X (CONSTANTS)(R0), ROTATE + LXVD2X (CONSTANTS)(OFFSET1), MASK + LXVD2X (CONSTANTS)(OFFSET2), FLIP + + LXVD2X (LOW)(R0), X6 + LXVD2X (HIGH)(R0), X7 + VPERM X6_, V31, FLIP_, X6_ + VPERM X7_, V31, FLIP_, X7_ + + MOVD $0, OFFSET + +loopXor: + LXVD2X (IN)(OFFSET), MSG + LXVD2X (OUT)(OFFSET), RESULT + + VSRB MSG_, ROTATE_, MSG_HI_ + VAND MSG_, MASK_, MSG_ + VPERM X6_, V31, MSG_, MSG_ + VPERM X7_, V31, MSG_HI_, MSG_HI_ + + VXOR MSG_, MSG_HI_, MSG_ + VXOR MSG_, RESULT_, RESULT_ + + STXVD2X RESULT, (OUT)(OFFSET) + + ADD $16, OFFSET, OFFSET + CMP LEN, OFFSET + BGT loopXor + RET + +DATA ·constants+0x0(SB)/8, $0x0404040404040404 +DATA ·constants+0x8(SB)/8, $0x0404040404040404 +DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f +DATA ·constants+0x20(SB)/8, $0x0706050403020100 +DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908 + +GLOBL ·constants(SB), 8, $48 diff --git a/vendor/github.com/klauspost/reedsolomon/inversion_tree.go b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go new file 
mode 100644 index 000000000..3f97f810a --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go @@ -0,0 +1,164 @@ +/** + * A thread-safe tree which caches inverted matrices. + * + * Copyright 2016, Peter Collins + */ + +package reedsolomon + +import ( + "errors" + "sync" +) + +// The tree uses a Reader-Writer mutex to make it thread-safe +// when accessing cached matrices and inserting new ones. +type inversionTree struct { + mutex sync.RWMutex + root inversionNode +} + +type inversionNode struct { + matrix matrix + children []*inversionNode +} + +// newInversionTree initializes a tree for storing inverted matrices. +// Note that the root node is the identity matrix as it implies +// there were no errors with the original data. +func newInversionTree(dataShards, parityShards int) *inversionTree { + identity, _ := identityMatrix(dataShards) + return &inversionTree{ + root: inversionNode{ + matrix: identity, + children: make([]*inversionNode, dataShards+parityShards), + }, + } +} + +// GetInvertedMatrix returns the cached inverted matrix or nil if it +// is not found in the tree keyed on the indices of invalid rows. +func (t *inversionTree) GetInvertedMatrix(invalidIndices []int) matrix { + if t == nil { + return nil + } + // Lock the tree for reading before accessing the tree. + t.mutex.RLock() + defer t.mutex.RUnlock() + + // If no invalid indices were give we should return the root + // identity matrix. + if len(invalidIndices) == 0 { + return t.root.matrix + } + + // Recursively search for the inverted matrix in the tree, passing in + // 0 as the parent index as we start at the root of the tree. + return t.root.getInvertedMatrix(invalidIndices, 0) +} + +// errAlreadySet is returned if the root node matrix is overwritten +var errAlreadySet = errors.New("the root node identity matrix is already set") + +// InsertInvertedMatrix inserts a new inverted matrix into the tree +// keyed by the indices of invalid rows. The total number of shards +// is required for creating the proper length lists of child nodes for +// each node. +func (t *inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error { + if t == nil { + return nil + } + // If no invalid indices were given then we are done because the + // root node is already set with the identity matrix. + if len(invalidIndices) == 0 { + return errAlreadySet + } + + if !matrix.IsSquare() { + return errNotSquare + } + + // Lock the tree for writing and reading before accessing the tree. + t.mutex.Lock() + defer t.mutex.Unlock() + + // Recursively create nodes for the inverted matrix in the tree until + // we reach the node to insert the matrix to. We start by passing in + // 0 as the parent index as we start at the root of the tree. + t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0) + + return nil +} + +func (n *inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix { + // Get the child node to search next from the list of children. The + // list of children starts relative to the parent index passed in + // because the indices of invalid rows is sorted (by default). As we + // search recursively, the first invalid index gets popped off the list, + // so when searching through the list of children, use that first invalid + // index to find the child node. 
+ firstIndex := invalidIndices[0] + node := n.children[firstIndex-parent] + + // If the child node doesn't exist in the list yet, fail fast by + // returning, so we can construct and insert the proper inverted matrix. + if node == nil { + return nil + } + + // If there's more than one invalid index left in the list we should + // keep searching recursively. + if len(invalidIndices) > 1 { + // Search recursively on the child node by passing in the invalid indices + // with the first index popped off the front. Also the parent index to + // pass down is the first index plus one. + return node.getInvertedMatrix(invalidIndices[1:], firstIndex+1) + } + // If there aren't any more invalid indices to search, we've found our + // node. Return it, however keep in mind that the matrix could still be + // nil because intermediary nodes in the tree are created sometimes with + // their inversion matrices uninitialized. + return node.matrix +} + +func (n *inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) { + // As above, get the child node to search next from the list of children. + // The list of children starts relative to the parent index passed in + // because the indices of invalid rows is sorted (by default). As we + // search recursively, the first invalid index gets popped off the list, + // so when searching through the list of children, use that first invalid + // index to find the child node. + firstIndex := invalidIndices[0] + node := n.children[firstIndex-parent] + + // If the child node doesn't exist in the list yet, create a new + // node because we have the writer lock and add it to the list + // of children. + if node == nil { + // Make the length of the list of children equal to the number + // of shards minus the first invalid index because the list of + // invalid indices is sorted, so only this length of errors + // are possible in the tree. + node = &inversionNode{ + children: make([]*inversionNode, shards-firstIndex), + } + // Insert the new node into the tree at the first index relative + // to the parent index that was given in this recursive call. + n.children[firstIndex-parent] = node + } + + // If there's more than one invalid index left in the list we should + // keep searching recursively in order to find the node to add our + // matrix. + if len(invalidIndices) > 1 { + // As above, search recursively on the child node by passing in + // the invalid indices with the first index popped off the front. + // Also the total number of shards and parent index are passed down + // which is equal to the first index plus one. + node.insertInvertedMatrix(invalidIndices[1:], matrix, shards, firstIndex+1) + } else { + // If there aren't any more invalid indices to search, we've found our + // node. Cache the inverted matrix in this node. + node.matrix = matrix + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/leopard.go b/vendor/github.com/klauspost/reedsolomon/leopard.go new file mode 100644 index 000000000..6b4c80184 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/leopard.go @@ -0,0 +1,1262 @@ +package reedsolomon + +// This is a O(n*log n) implementation of Reed-Solomon +// codes, ported from the C++ library https://github.com/catid/leopard. +// +// The implementation is based on the paper +// +// S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung, +// "Novel Polynomial Basis with Fast Fourier Transform +// and Its Application to Reed-Solomon Erasure Codes" +// IEEE Trans. on Information Theory, pp. 
6284-6299, November, 2016. + +import ( + "bytes" + "io" + "math/bits" + "sync" + "unsafe" + + "github.com/klauspost/cpuid/v2" +) + +// leopardFF16 is like reedSolomon but for more than 256 total shards. +type leopardFF16 struct { + dataShards int // Number of data shards, should not be modified. + parityShards int // Number of parity shards, should not be modified. + totalShards int // Total number of shards. Calculated, and should not be modified. + + workPool sync.Pool + + o options +} + +// newFF16 is like New, but for more than 256 total shards. +func newFF16(dataShards, parityShards int, opt options) (*leopardFF16, error) { + initConstants() + + if dataShards <= 0 || parityShards <= 0 { + return nil, ErrInvShardNum + } + + if dataShards+parityShards > 65536 { + return nil, ErrMaxShardNum + } + + r := &leopardFF16{ + dataShards: dataShards, + parityShards: parityShards, + totalShards: dataShards + parityShards, + o: opt, + } + return r, nil +} + +var _ = Extensions(&leopardFF16{}) + +func (r *leopardFF16) ShardSizeMultiple() int { + return 64 +} + +func (r *leopardFF16) DataShards() int { + return r.dataShards +} + +func (r *leopardFF16) ParityShards() int { + return r.parityShards +} + +func (r *leopardFF16) TotalShards() int { + return r.totalShards +} + +func (r *leopardFF16) AllocAligned(each int) [][]byte { + return AllocAligned(r.totalShards, each) +} + +type ffe uint16 + +const ( + bitwidth = 16 + order = 1 << bitwidth + modulus = order - 1 + polynomial = 0x1002D +) + +var ( + fftSkew *[modulus]ffe + logWalsh *[order]ffe +) + +// Logarithm Tables +var ( + logLUT *[order]ffe + expLUT *[order]ffe +) + +// Stores the partial products of x * y at offset x + y * 65536 +// Repeated accesses from the same y value are faster +var mul16LUTs *[order]mul16LUT + +type mul16LUT struct { + // Contains Lo product as a single lookup. + // Should be XORed with Hi lookup for result. 
+ Lo [256]ffe + Hi [256]ffe +} + +// Stores lookup for avx2 +var multiply256LUT *[order][8 * 16]byte + +func (r *leopardFF16) Encode(shards [][]byte) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + if err := checkShards(shards, false); err != nil { + return err + } + return r.encode(shards) +} + +func (r *leopardFF16) encode(shards [][]byte) error { + shardSize := shardSize(shards) + if shardSize%64 != 0 { + return ErrInvalidShardSize + } + + m := ceilPow2(r.parityShards) + var work [][]byte + if w, ok := r.workPool.Get().([][]byte); ok { + work = w + } + if cap(work) >= m*2 { + work = work[:m*2] + } else { + work = AllocAligned(m*2, shardSize) + } + for i := range work { + if cap(work[i]) < shardSize { + work[i] = AllocAligned(1, shardSize)[0] + } else { + work[i] = work[i][:shardSize] + } + } + defer r.workPool.Put(work) + + mtrunc := m + if r.dataShards < mtrunc { + mtrunc = r.dataShards + } + + skewLUT := fftSkew[m-1:] + + sh := shards + ifftDITEncoder( + sh[:r.dataShards], + mtrunc, + work, + nil, // No xor output + m, + skewLUT, + &r.o, + ) + + lastCount := r.dataShards % m + if m >= r.dataShards { + goto skip_body + } + + // For sets of m data pieces: + for i := m; i+m <= r.dataShards; i += m { + sh = sh[m:] + skewLUT = skewLUT[m:] + + // work <- work xor IFFT(data + i, m, m + i) + + ifftDITEncoder( + sh, // data source + m, + work[m:], // temporary workspace + work, // xor destination + m, + skewLUT, + &r.o, + ) + } + + // Handle final partial set of m pieces: + if lastCount != 0 { + sh = sh[m:] + skewLUT = skewLUT[m:] + + // work <- work xor IFFT(data + i, m, m + i) + + ifftDITEncoder( + sh, // data source + lastCount, + work[m:], // temporary workspace + work, // xor destination + m, + skewLUT, + &r.o, + ) + } + +skip_body: + // work <- FFT(work, m, 0) + fftDIT(work, r.parityShards, m, fftSkew[:], &r.o) + + for i, w := range work[:r.parityShards] { + sh := shards[i+r.dataShards] + if cap(sh) >= shardSize { + sh = append(sh[:0], w...) + } else { + sh = w + } + shards[i+r.dataShards] = sh + } + + return nil +} + +func (r *leopardFF16) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error { + return ErrNotSupported +} + +func (r *leopardFF16) Join(dst io.Writer, shards [][]byte, outSize int) error { + // Do we have enough shards? + if len(shards) < r.dataShards { + return ErrTooFewShards + } + shards = shards[:r.dataShards] + + // Do we have enough data? + size := 0 + for _, shard := range shards { + if shard == nil { + return ErrReconstructRequired + } + size += len(shard) + + // Do we have enough data already? + if size >= outSize { + break + } + } + if size < outSize { + return ErrShortData + } + + // Copy data to dst + write := outSize + for _, shard := range shards { + if write < len(shard) { + _, err := dst.Write(shard[:write]) + return err + } + n, err := dst.Write(shard) + if err != nil { + return err + } + write -= n + } + return nil +} + +func (r *leopardFF16) Update(shards [][]byte, newDatashards [][]byte) error { + return ErrNotSupported +} + +func (r *leopardFF16) Split(data []byte) ([][]byte, error) { + if len(data) == 0 { + return nil, ErrShortData + } + if r.totalShards == 1 && len(data)&63 == 0 { + return [][]byte{data}, nil + } + dataLen := len(data) + // Calculate number of bytes per data shard. 
+ perShard := (len(data) + r.dataShards - 1) / r.dataShards + perShard = ((perShard + 63) / 64) * 64 + needTotal := r.totalShards * perShard + + if cap(data) > len(data) { + if cap(data) > needTotal { + data = data[:needTotal] + } else { + data = data[:cap(data)] + } + clear := data[dataLen:] + for i := range clear { + clear[i] = 0 + } + } + + // Only allocate memory if necessary + var padding [][]byte + if len(data) < needTotal { + // calculate maximum number of full shards in `data` slice + fullShards := len(data) / perShard + padding = AllocAligned(r.totalShards-fullShards, perShard) + if dataLen > perShard*fullShards { + // Copy partial shards + copyFrom := data[perShard*fullShards : dataLen] + for i := range padding { + if len(copyFrom) == 0 { + break + } + copyFrom = copyFrom[copy(padding[i], copyFrom):] + } + } + } else { + zero := data[dataLen : r.totalShards*perShard] + for i := range zero { + zero[i] = 0 + } + } + + // Split into equal-length shards. + dst := make([][]byte, r.totalShards) + i := 0 + for ; i < len(dst) && len(data) >= perShard; i++ { + dst[i] = data[:perShard:perShard] + data = data[perShard:] + } + + for j := 0; i+j < len(dst); j++ { + dst[i+j] = padding[0] + padding = padding[1:] + } + + return dst, nil +} + +func (r *leopardFF16) ReconstructSome(shards [][]byte, required []bool) error { + if len(required) == r.totalShards { + return r.reconstruct(shards, true) + } + return r.reconstruct(shards, false) +} + +func (r *leopardFF16) Reconstruct(shards [][]byte) error { + return r.reconstruct(shards, true) +} + +func (r *leopardFF16) ReconstructData(shards [][]byte) error { + return r.reconstruct(shards, false) +} + +func (r *leopardFF16) Verify(shards [][]byte) (bool, error) { + if len(shards) != r.totalShards { + return false, ErrTooFewShards + } + if err := checkShards(shards, false); err != nil { + return false, err + } + + // Re-encode parity shards to temporary storage. + shardSize := len(shards[0]) + outputs := make([][]byte, r.totalShards) + copy(outputs, shards[:r.dataShards]) + for i := r.dataShards; i < r.totalShards; i++ { + outputs[i] = make([]byte, shardSize) + } + if err := r.Encode(outputs); err != nil { + return false, err + } + + // Compare. + for i := r.dataShards; i < r.totalShards; i++ { + if !bytes.Equal(outputs[i], shards[i]) { + return false, nil + } + } + return true, nil +} + +func (r *leopardFF16) reconstruct(shards [][]byte, recoverAll bool) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + if err := checkShards(shards, true); err != nil { + return err + } + + // Quick check: are all of the shards present? If so, there's + // nothing to do. + numberPresent := 0 + dataPresent := 0 + for i := 0; i < r.totalShards; i++ { + if len(shards[i]) != 0 { + numberPresent++ + if i < r.dataShards { + dataPresent++ + } + } + } + if numberPresent == r.totalShards || !recoverAll && dataPresent == r.dataShards { + // Cool. All of the shards have data. We don't + // need to do anything. + return nil + } + + // Use only if we are missing less than 1/4 parity. + useBits := r.totalShards-numberPresent <= r.parityShards/4 + + // Check if we have enough to reconstruct. + if numberPresent < r.dataShards { + return ErrTooFewShards + } + + shardSize := shardSize(shards) + if shardSize%64 != 0 { + return ErrInvalidShardSize + } + + m := ceilPow2(r.parityShards) + n := ceilPow2(m + r.dataShards) + + const LEO_ERROR_BITFIELD_OPT = true + + // Fill in error locations. 
+ var errorBits errorBitfield + var errLocs [order]ffe + for i := 0; i < r.parityShards; i++ { + if len(shards[i+r.dataShards]) == 0 { + errLocs[i] = 1 + if LEO_ERROR_BITFIELD_OPT && recoverAll { + errorBits.set(i) + } + } + } + for i := r.parityShards; i < m; i++ { + errLocs[i] = 1 + if LEO_ERROR_BITFIELD_OPT && recoverAll { + errorBits.set(i) + } + } + for i := 0; i < r.dataShards; i++ { + if len(shards[i]) == 0 { + errLocs[i+m] = 1 + if LEO_ERROR_BITFIELD_OPT { + errorBits.set(i + m) + } + } + } + + if LEO_ERROR_BITFIELD_OPT && useBits { + errorBits.prepare() + } + + // Evaluate error locator polynomial + fwht(&errLocs, order, m+r.dataShards) + + for i := 0; i < order; i++ { + errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus) + } + + fwht(&errLocs, order, order) + + var work [][]byte + if w, ok := r.workPool.Get().([][]byte); ok { + work = w + } + if cap(work) >= n { + work = work[:n] + } else { + work = make([][]byte, n) + } + for i := range work { + if cap(work[i]) < shardSize { + work[i] = make([]byte, shardSize) + } else { + work[i] = work[i][:shardSize] + } + } + defer r.workPool.Put(work) + + // work <- recovery data + + for i := 0; i < r.parityShards; i++ { + if len(shards[i+r.dataShards]) != 0 { + mulgf16(work[i], shards[i+r.dataShards], errLocs[i], &r.o) + } else { + memclr(work[i]) + } + } + for i := r.parityShards; i < m; i++ { + memclr(work[i]) + } + + // work <- original data + + for i := 0; i < r.dataShards; i++ { + if len(shards[i]) != 0 { + mulgf16(work[m+i], shards[i], errLocs[m+i], &r.o) + } else { + memclr(work[m+i]) + } + } + for i := m + r.dataShards; i < n; i++ { + memclr(work[i]) + } + + // work <- IFFT(work, n, 0) + + ifftDITDecoder( + m+r.dataShards, + work, + n, + fftSkew[:], + &r.o, + ) + + // work <- FormalDerivative(work, n) + + for i := 1; i < n; i++ { + width := ((i ^ (i - 1)) + 1) >> 1 + slicesXor(work[i-width:i], work[i:i+width], &r.o) + } + + // work <- FFT(work, n, 0) truncated to m + dataShards + + outputCount := m + r.dataShards + + if LEO_ERROR_BITFIELD_OPT && useBits { + errorBits.fftDIT(work, outputCount, n, fftSkew[:], &r.o) + } else { + fftDIT(work, outputCount, n, fftSkew[:], &r.o) + } + + // Reveal erasures + // + // Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) ) + // mul_mem(x, y, log_m, ) equals x[] = y[] * log_m + // + // mem layout: [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N] + end := r.dataShards + if recoverAll { + end = r.totalShards + } + for i := 0; i < end; i++ { + if len(shards[i]) != 0 { + continue + } + if cap(shards[i]) >= shardSize { + shards[i] = shards[i][:shardSize] + } else { + shards[i] = make([]byte, shardSize) + } + if i >= r.dataShards { + // Parity shard. + mulgf16(shards[i], work[i-r.dataShards], modulus-errLocs[i-r.dataShards], &r.o) + } else { + // Data shard. 
+ mulgf16(shards[i], work[i+m], modulus-errLocs[i+m], &r.o) + } + } + return nil +} + +// Basic no-frills version for decoder +func ifftDITDecoder(mtrunc int, work [][]byte, m int, skewLUT []ffe, o *options) { + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend-1] + log_m02 := skewLUT[iend+dist-1] + log_m23 := skewLUT[iend+dist*2-1] + + // For each set of dist elements: + for i := r; i < iend; i++ { + ifftDIT4(work[i:], dist, log_m01, log_m23, log_m02, o) + } + } + dist = dist4 + dist4 <<= 2 + } + + // If there is one layer left: + if dist < m { + // Assuming that dist = m / 2 + if dist*2 != m { + panic("internal error") + } + + log_m := skewLUT[dist-1] + + if log_m == modulus { + slicesXor(work[dist:2*dist], work[:dist], o) + } else { + for i := 0; i < dist; i++ { + ifftDIT2( + work[i], + work[i+dist], + log_m, + o, + ) + } + } + } +} + +// In-place FFT for encoder and decoder +func fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) { + // Decimation in time: Unroll 2 layers at a time + dist4 := m + dist := m >> 2 + for dist != 0 { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend-1] + log_m02 := skewLUT[iend+dist-1] + log_m23 := skewLUT[iend+dist*2-1] + + // For each set of dist elements: + for i := r; i < iend; i++ { + fftDIT4( + work[i:], + dist, + log_m01, + log_m23, + log_m02, + o, + ) + } + } + dist4 = dist + dist >>= 2 + } + + // If there is one layer left: + if dist4 == 2 { + for r := 0; r < mtrunc; r += 2 { + log_m := skewLUT[r+1-1] + + if log_m == modulus { + sliceXor(work[r], work[r+1], o) + } else { + fftDIT2(work[r], work[r+1], log_m, o) + } + } + } +} + +// 4-way butterfly +func fftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + // First layer: + if log_m02 == modulus { + sliceXor(work[0], work[dist*2], o) + sliceXor(work[dist], work[dist*3], o) + } else { + fftDIT2(work[0], work[dist*2], log_m02, o) + fftDIT2(work[dist], work[dist*3], log_m02, o) + } + + // Second layer: + if log_m01 == modulus { + sliceXor(work[0], work[dist], o) + } else { + fftDIT2(work[0], work[dist], log_m01, o) + } + + if log_m23 == modulus { + sliceXor(work[dist*2], work[dist*3], o) + } else { + fftDIT2(work[dist*2], work[dist*3], log_m23, o) + } +} + +// Unrolled IFFT for encoder +func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe, o *options) { + // I tried rolling the memcpy/memset into the first layer of the FFT and + // found that it only yields a 4% performance improvement, which is not + // worth the extra complexity. + for i := 0; i < mtrunc; i++ { + copy(work[i], data[i]) + } + for i := mtrunc; i < m; i++ { + memclr(work[i]) + } + + // I tried splitting up the first few layers into L3-cache sized blocks but + // found that it only provides about 5% performance boost, which is not + // worth the extra complexity. 
+ + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend] + log_m02 := skewLUT[iend+dist] + log_m23 := skewLUT[iend+dist*2] + + // For each set of dist elements: + for i := r; i < iend; i++ { + ifftDIT4( + work[i:], + dist, + log_m01, + log_m23, + log_m02, + o, + ) + } + } + + dist = dist4 + dist4 <<= 2 + // I tried alternating sweeps left->right and right->left to reduce cache misses. + // It provides about 1% performance boost when done for both FFT and IFFT, so it + // does not seem to be worth the extra complexity. + } + + // If there is one layer left: + if dist < m { + // Assuming that dist = m / 2 + if dist*2 != m { + panic("internal error") + } + + logm := skewLUT[dist] + + if logm == modulus { + slicesXor(work[dist:dist*2], work[:dist], o) + } else { + for i := 0; i < dist; i++ { + ifftDIT2(work[i], work[i+dist], logm, o) + } + } + } + + // I tried unrolling this but it does not provide more than 5% performance + // improvement for 16-bit finite fields, so it's not worth the complexity. + if xorRes != nil { + slicesXor(xorRes[:m], work[:m], o) + } +} + +func ifftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + // First layer: + if log_m01 == modulus { + sliceXor(work[0], work[dist], o) + } else { + ifftDIT2(work[0], work[dist], log_m01, o) + } + + if log_m23 == modulus { + sliceXor(work[dist*2], work[dist*3], o) + } else { + ifftDIT2(work[dist*2], work[dist*3], log_m23, o) + } + + // Second layer: + if log_m02 == modulus { + sliceXor(work[0], work[dist*2], o) + sliceXor(work[dist], work[dist*3], o) + } else { + ifftDIT2(work[0], work[dist*2], log_m02, o) + ifftDIT2(work[dist], work[dist*3], log_m02, o) + } +} + +// Reference version of muladd: x[] ^= y[] * log_m +func refMulAdd(x, y []byte, log_m ffe) { + lut := &mul16LUTs[log_m] + + for len(x) >= 64 { + // Assert sizes for no bounds checks in loop + hiA := y[32:64] + loA := y[:32] + dst := x[:64] // Needed, but not checked... + for i, lo := range loA { + hi := hiA[i] + prod := lut.Lo[lo] ^ lut.Hi[hi] + + dst[i] ^= byte(prod) + dst[i+32] ^= byte(prod >> 8) + } + x = x[64:] + y = y[64:] + } +} + +func memclr(s []byte) { + for i := range s { + s[i] = 0 + } +} + +// slicesXor calls xor for every slice pair in v1, v2. +func slicesXor(v1, v2 [][]byte, o *options) { + for i, v := range v1 { + sliceXor(v2[i], v, o) + } +} + +// Reference version of mul: x[] = y[] * log_m +func refMul(x, y []byte, log_m ffe) { + lut := &mul16LUTs[log_m] + + for off := 0; off < len(x); off += 64 { + loA := y[off : off+32] + hiA := y[off+32:] + hiA = hiA[:len(loA)] + for i, lo := range loA { + hi := hiA[i] + prod := lut.Lo[lo] ^ lut.Hi[hi] + + x[off+i] = byte(prod) + x[off+i+32] = byte(prod >> 8) + } + } +} + +// Returns a * Log(b) +func mulLog(a, log_b ffe) ffe { + /* + Note that this operation is not a normal multiplication in a finite + field because the right operand is already a logarithm. This is done + because it moves K table lookups from the Decode() method into the + initialization step that is less performance critical. The LogWalsh[] + table below contains precalculated logarithms so it is easier to do + all the other multiplies in that form as well. 
+ */ + if a == 0 { + return 0 + } + return expLUT[addMod(logLUT[a], log_b)] +} + +// z = x + y (mod kModulus) +func addMod(a, b ffe) ffe { + sum := uint(a) + uint(b) + + // Partial reduction step, allowing for kModulus to be returned + return ffe(sum + sum>>bitwidth) +} + +// z = x - y (mod kModulus) +func subMod(a, b ffe) ffe { + dif := uint(a) - uint(b) + + // Partial reduction step, allowing for kModulus to be returned + return ffe(dif + dif>>bitwidth) +} + +// ceilPow2 returns power of two at or above n. +func ceilPow2(n int) int { + const w = int(unsafe.Sizeof(n) * 8) + return 1 << (w - bits.LeadingZeros(uint(n-1))) +} + +// Decimation in time (DIT) Fast Walsh-Hadamard Transform +// Unrolls pairs of layers to perform cross-layer operations in registers +// mtrunc: Number of elements that are non-zero at the front of data +func fwht(data *[order]ffe, m, mtrunc int) { + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + // For each set of dist elements: + // Use 16 bit indices to avoid bounds check on [65536]ffe. + dist := uint16(dist) + off := uint16(r) + for i := uint16(0); i < dist; i++ { + // fwht4(data[i:], dist) inlined... + // Reading values appear faster than updating pointers. + // Casting to uint is not faster. + t0 := data[off] + t1 := data[off+dist] + t2 := data[off+dist*2] + t3 := data[off+dist*3] + + t0, t1 = fwht2alt(t0, t1) + t2, t3 = fwht2alt(t2, t3) + t0, t2 = fwht2alt(t0, t2) + t1, t3 = fwht2alt(t1, t3) + + data[off] = t0 + data[off+dist] = t1 + data[off+dist*2] = t2 + data[off+dist*3] = t3 + off++ + } + } + dist = dist4 + dist4 <<= 2 + } + + // If there is one layer left: + if dist < m { + dist := uint16(dist) + for i := uint16(0); i < dist; i++ { + fwht2(&data[i], &data[i+dist]) + } + } +} + +func fwht4(data []ffe, s int) { + s2 := s << 1 + + t0 := &data[0] + t1 := &data[s] + t2 := &data[s2] + t3 := &data[s2+s] + + fwht2(t0, t1) + fwht2(t2, t3) + fwht2(t0, t2) + fwht2(t1, t3) +} + +// {a, b} = {a + b, a - b} (Mod Q) +func fwht2(a, b *ffe) { + sum := addMod(*a, *b) + dif := subMod(*a, *b) + *a = sum + *b = dif +} + +// fwht2alt is as fwht2, but returns result. +func fwht2alt(a, b ffe) (ffe, ffe) { + return addMod(a, b), subMod(a, b) +} + +var initOnce sync.Once + +func initConstants() { + initOnce.Do(func() { + initLUTs() + initFFTSkew() + initMul16LUT() + }) +} + +// Initialize logLUT, expLUT. +func initLUTs() { + cantorBasis := [bitwidth]ffe{ + 0x0001, 0xACCA, 0x3C0E, 0x163E, + 0xC582, 0xED2E, 0x914C, 0x4012, + 0x6C98, 0x10D8, 0x6A72, 0xB900, + 0xFDB8, 0xFB34, 0xFF38, 0x991E, + } + + expLUT = &[order]ffe{} + logLUT = &[order]ffe{} + + // LFSR table generation: + state := 1 + for i := ffe(0); i < modulus; i++ { + expLUT[state] = i + state <<= 1 + if state >= order { + state ^= polynomial + } + } + expLUT[0] = modulus + + // Conversion to Cantor basis: + + logLUT[0] = 0 + for i := 0; i < bitwidth; i++ { + basis := cantorBasis[i] + width := 1 << i + + for j := 0; j < width; j++ { + logLUT[j+width] = logLUT[j] ^ basis + } + } + + for i := 0; i < order; i++ { + logLUT[i] = expLUT[logLUT[i]] + } + + for i := 0; i < order; i++ { + expLUT[logLUT[i]] = ffe(i) + } + + expLUT[modulus] = expLUT[0] +} + +// Initialize fftSkew. 
+func initFFTSkew() { + var temp [bitwidth - 1]ffe + + // Generate FFT skew vector {1}: + + for i := 1; i < bitwidth; i++ { + temp[i-1] = ffe(1 << i) + } + + fftSkew = &[modulus]ffe{} + logWalsh = &[order]ffe{} + + for m := 0; m < bitwidth-1; m++ { + step := 1 << (m + 1) + + fftSkew[1<>4)+16)] + lut.Hi[i] = tmp[((i&15)+32)] ^ tmp[((i>>4)+48)] + } + } + if cpuid.CPU.Has(cpuid.SSSE3) || cpuid.CPU.Has(cpuid.AVX2) || cpuid.CPU.Has(cpuid.AVX512F) { + multiply256LUT = &[order][16 * 8]byte{} + + for logM := range multiply256LUT[:] { + // For each 4 bits of the finite field width in bits: + shift := 0 + for i := 0; i < 4; i++ { + // Construct 16 entry LUT for PSHUFB + prodLo := multiply256LUT[logM][i*16 : i*16+16] + prodHi := multiply256LUT[logM][4*16+i*16 : 4*16+i*16+16] + for x := range prodLo[:] { + prod := mulLog(ffe(x<> 8) + } + shift += 4 + } + } + } +} + +const kWordMips = 5 +const kWords = order / 64 +const kBigMips = 6 +const kBigWords = (kWords + 63) / 64 +const kBiggestMips = 4 + +// errorBitfield contains progressive errors to help indicate which +// shards need reconstruction. +type errorBitfield struct { + Words [kWordMips][kWords]uint64 + BigWords [kBigMips][kBigWords]uint64 + BiggestWords [kBiggestMips]uint64 +} + +func (e *errorBitfield) set(i int) { + e.Words[0][i/64] |= uint64(1) << (i & 63) +} + +func (e *errorBitfield) isNeededFn(mipLevel int) func(bit int) bool { + if mipLevel >= 16 { + return func(bit int) bool { + return true + } + } + if mipLevel >= 12 { + w := e.BiggestWords[mipLevel-12] + return func(bit int) bool { + bit /= 4096 + return 0 != (w & (uint64(1) << bit)) + } + } + if mipLevel >= 6 { + w := e.BigWords[mipLevel-6][:] + return func(bit int) bool { + bit /= 64 + return 0 != (w[bit/64] & (uint64(1) << (bit & 63))) + } + } + if mipLevel > 0 { + w := e.Words[mipLevel-1][:] + return func(bit int) bool { + return 0 != (w[bit/64] & (uint64(1) << (bit & 63))) + } + } + return nil +} + +func (e *errorBitfield) isNeeded(mipLevel int, bit uint) bool { + if mipLevel >= 16 { + return true + } + if mipLevel >= 12 { + bit /= 4096 + return 0 != (e.BiggestWords[mipLevel-12] & (uint64(1) << bit)) + } + if mipLevel >= 6 { + bit /= 64 + return 0 != (e.BigWords[mipLevel-6][bit/64] & (uint64(1) << (bit % 64))) + } + return 0 != (e.Words[mipLevel-1][bit/64] & (uint64(1) << (bit % 64))) +} + +var kHiMasks = [5]uint64{ + 0xAAAAAAAAAAAAAAAA, + 0xCCCCCCCCCCCCCCCC, + 0xF0F0F0F0F0F0F0F0, + 0xFF00FF00FF00FF00, + 0xFFFF0000FFFF0000, +} + +func (e *errorBitfield) prepare() { + // First mip level is for final layer of FFT: pairs of data + for i := 0; i < kWords; i++ { + w_i := e.Words[0][i] + hi2lo0 := w_i | ((w_i & kHiMasks[0]) >> 1) + lo2hi0 := (w_i & (kHiMasks[0] >> 1)) << 1 + w_i = hi2lo0 | lo2hi0 + e.Words[0][i] = w_i + + bits := 2 + for j := 1; j < kWordMips; j++ { + hi2lo_j := w_i | ((w_i & kHiMasks[j]) >> bits) + lo2hi_j := (w_i & (kHiMasks[j] >> bits)) << bits + w_i = hi2lo_j | lo2hi_j + e.Words[j][i] = w_i + bits <<= 1 + } + } + + for i := 0; i < kBigWords; i++ { + w_i := uint64(0) + bit := uint64(1) + src := e.Words[kWordMips-1][i*64 : i*64+64] + for _, w := range src { + w_i |= (w | (w >> 32) | (w << 32)) & bit + bit <<= 1 + } + e.BigWords[0][i] = w_i + + bits := 1 + for j := 1; j < kBigMips; j++ { + hi2lo_j := w_i | ((w_i & kHiMasks[j-1]) >> bits) + lo2hi_j := (w_i & (kHiMasks[j-1] >> bits)) << bits + w_i = hi2lo_j | lo2hi_j + e.BigWords[j][i] = w_i + bits <<= 1 + } + } + + w_i := uint64(0) + bit := uint64(1) + for _, w := range e.BigWords[kBigMips-1][:kBigWords] { + w_i |= (w | 
(w >> 32) | (w << 32)) & bit + bit <<= 1 + } + e.BiggestWords[0] = w_i + + bits := uint64(1) + for j := 1; j < kBiggestMips; j++ { + hi2lo_j := w_i | ((w_i & kHiMasks[j-1]) >> bits) + lo2hi_j := (w_i & (kHiMasks[j-1] >> bits)) << bits + w_i = hi2lo_j | lo2hi_j + e.BiggestWords[j] = w_i + bits <<= 1 + } +} + +func (e *errorBitfield) fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) { + // Decimation in time: Unroll 2 layers at a time + mipLevel := bits.Len32(uint32(m)) - 1 + + dist4 := m + dist := m >> 2 + needed := e.isNeededFn(mipLevel) + for dist != 0 { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + if !needed(r) { + continue + } + iEnd := r + dist + logM01 := skewLUT[iEnd-1] + logM02 := skewLUT[iEnd+dist-1] + logM23 := skewLUT[iEnd+dist*2-1] + + // For each set of dist elements: + for i := r; i < iEnd; i++ { + fftDIT4( + work[i:], + dist, + logM01, + logM23, + logM02, + o) + } + } + dist4 = dist + dist >>= 2 + mipLevel -= 2 + needed = e.isNeededFn(mipLevel) + } + + // If there is one layer left: + if dist4 == 2 { + for r := 0; r < mtrunc; r += 2 { + if !needed(r) { + continue + } + logM := skewLUT[r+1-1] + + if logM == modulus { + sliceXor(work[r], work[r+1], o) + } else { + fftDIT2(work[r], work[r+1], logM, o) + } + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/leopard8.go b/vendor/github.com/klauspost/reedsolomon/leopard8.go new file mode 100644 index 000000000..cd863a136 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/leopard8.go @@ -0,0 +1,1269 @@ +package reedsolomon + +// This is a O(n*log n) implementation of Reed-Solomon +// codes, ported from the C++ library https://github.com/catid/leopard. +// +// The implementation is based on the paper +// +// S.-J. Lin, T. Y. Al-Naffouri, Y. S. Han, and W.-H. Chung, +// "Novel Polynomial Basis with Fast Fourier Transform +// and Its Application to Reed-Solomon Erasure Codes" +// IEEE Trans. on Information Theory, pp. 6284-6299, November, 2016. + +import ( + "bytes" + "encoding/binary" + "io" + "math/bits" + "sync" +) + +// leopardFF8 is like reedSolomon but for the 8-bit "leopard" implementation. +type leopardFF8 struct { + dataShards int // Number of data shards, should not be modified. + parityShards int // Number of parity shards, should not be modified. + totalShards int // Total number of shards. Calculated, and should not be modified. + + workPool sync.Pool + inversion map[[inversion8Bytes]byte]leopardGF8cache + inversionMu sync.Mutex + + o options +} + +const inversion8Bytes = 256 / 8 + +type leopardGF8cache struct { + errorLocs [256]ffe8 + bits *errorBitfield8 +} + +// newFF8 is like New, but for the 8-bit "leopard" implementation. +func newFF8(dataShards, parityShards int, opt options) (*leopardFF8, error) { + initConstants8() + + if dataShards <= 0 || parityShards <= 0 { + return nil, ErrInvShardNum + } + + if dataShards+parityShards > 65536 { + return nil, ErrMaxShardNum + } + + r := &leopardFF8{ + dataShards: dataShards, + parityShards: parityShards, + totalShards: dataShards + parityShards, + o: opt, + } + if opt.inversionCache && (r.totalShards <= 64 || opt.forcedInversionCache) { + // Inversion cache is relatively ineffective for big shard counts and takes up potentially lots of memory + // r.totalShards is not covering the space, but an estimate. 
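The constructor here only builds the 8-bit inversion map when caching is enabled and the shard count is small (or caching is forced), since the comment notes it is relatively ineffective and memory-hungry for large codes. A hedged sketch of controlling that from user code through the WithInversionCache and WithLeopardGF options defined later in this diff; the shard counts are made up:

```go
package main

import "github.com/klauspost/reedsolomon"

func main() {
	// 128 total shards with WithLeopardGF(true) selects the 8-bit Leopard codec;
	// WithInversionCache(false) skips caching reconstruction state entirely,
	// trading repeated recomputation for lower memory use on large codes.
	enc, err := reedsolomon.New(100, 28,
		reedsolomon.WithLeopardGF(true),
		reedsolomon.WithInversionCache(false),
	)
	if err != nil {
		panic(err)
	}
	_ = enc
}
```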
+ r.inversion = make(map[[inversion8Bytes]byte]leopardGF8cache, r.totalShards) + } + return r, nil +} + +var _ = Extensions(&leopardFF8{}) + +func (r *leopardFF8) ShardSizeMultiple() int { + return 64 +} + +func (r *leopardFF8) DataShards() int { + return r.dataShards +} + +func (r *leopardFF8) ParityShards() int { + return r.parityShards +} + +func (r *leopardFF8) TotalShards() int { + return r.totalShards +} + +func (r *leopardFF8) AllocAligned(each int) [][]byte { + return AllocAligned(r.totalShards, each) +} + +type ffe8 uint8 + +const ( + bitwidth8 = 8 + order8 = 1 << bitwidth8 + modulus8 = order8 - 1 + polynomial8 = 0x11D + + // Encode in blocks of this size. + workSize8 = 32 << 10 +) + +var ( + fftSkew8 *[modulus8]ffe8 + logWalsh8 *[order8]ffe8 +) + +// Logarithm Tables +var ( + logLUT8 *[order8]ffe8 + expLUT8 *[order8]ffe8 +) + +// Stores the partial products of x * y at offset x + y * 256 +// Repeated accesses from the same y value are faster +var mul8LUTs *[order8]mul8LUT + +type mul8LUT struct { + Value [256]ffe8 +} + +// Stores lookup for avx2 +var multiply256LUT8 *[order8][2 * 16]byte + +func (r *leopardFF8) Encode(shards [][]byte) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + if err := checkShards(shards, false); err != nil { + return err + } + return r.encode(shards) +} + +func (r *leopardFF8) encode(shards [][]byte) error { + shardSize := shardSize(shards) + if shardSize%64 != 0 { + return ErrInvalidShardSize + } + + m := ceilPow2(r.parityShards) + var work [][]byte + if w, ok := r.workPool.Get().([][]byte); ok { + work = w + } else { + work = AllocAligned(m*2, workSize8) + } + if cap(work) >= m*2 { + work = work[:m*2] + for i := range work { + if i >= r.parityShards { + if cap(work[i]) < workSize8 { + work[i] = AllocAligned(1, workSize8)[0] + } else { + work[i] = work[i][:workSize8] + } + } + } + } else { + work = AllocAligned(m*2, workSize8) + } + + defer r.workPool.Put(work) + + mtrunc := m + if r.dataShards < mtrunc { + mtrunc = r.dataShards + } + + skewLUT := fftSkew8[m-1:] + + // Split large shards. + // More likely on lower shard count. + off := 0 + sh := make([][]byte, len(shards)) + + // work slice we can modify + wMod := make([][]byte, len(work)) + copy(wMod, work) + for off < shardSize { + work := wMod + sh := sh + end := off + workSize8 + if end > shardSize { + end = shardSize + sz := shardSize - off + for i := range work { + // Last iteration only... + work[i] = work[i][:sz] + } + } + for i := range shards { + sh[i] = shards[i][off:end] + } + + // Replace work slices, so we write directly to output. + // Note that work has parity *before* data shards. 
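The encode path above walks every shard in fixed 32 KiB windows (workSize8), clamping the final window, so the working set stays small regardless of shard size. A standalone sketch of that blocking pattern; `processBlocks` and `process` are illustrative names, not library functions:

```go
package main

const workSize8 = 32 << 10 // mirrors the block size used by the 8-bit codec

// processBlocks visits a shard in workSize8-sized windows, clamping the last
// (possibly shorter) window, matching the blocking pattern used by encode above.
func processBlocks(shard []byte, process func([]byte)) {
	for off := 0; off < len(shard); off += workSize8 {
		end := off + workSize8
		if end > len(shard) {
			end = len(shard)
		}
		process(shard[off:end])
	}
}

func main() {
	shard := make([]byte, 100<<10) // hypothetical 100 KiB shard
	blocks := 0
	processBlocks(shard, func([]byte) { blocks++ })
	_ = blocks // 4 windows: 32 KiB + 32 KiB + 32 KiB + 4 KiB
}
```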
+ res := shards[r.dataShards:r.totalShards] + for i := range res { + work[i] = res[i][off:end] + } + + ifftDITEncoder8( + sh[:r.dataShards], + mtrunc, + work, + nil, // No xor output + m, + skewLUT, + &r.o, + ) + + lastCount := r.dataShards % m + skewLUT2 := skewLUT + if m >= r.dataShards { + goto skip_body + } + + // For sets of m data pieces: + for i := m; i+m <= r.dataShards; i += m { + sh = sh[m:] + skewLUT2 = skewLUT2[m:] + + // work <- work xor IFFT(data + i, m, m + i) + + ifftDITEncoder8( + sh, // data source + m, + work[m:], // temporary workspace + work, // xor destination + m, + skewLUT2, + &r.o, + ) + } + + // Handle final partial set of m pieces: + if lastCount != 0 { + sh = sh[m:] + skewLUT2 = skewLUT2[m:] + + // work <- work xor IFFT(data + i, m, m + i) + + ifftDITEncoder8( + sh, // data source + lastCount, + work[m:], // temporary workspace + work, // xor destination + m, + skewLUT2, + &r.o, + ) + } + + skip_body: + // work <- FFT(work, m, 0) + fftDIT8(work, r.parityShards, m, fftSkew8[:], &r.o) + off += workSize8 + } + + return nil +} + +func (r *leopardFF8) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error { + return ErrNotSupported +} + +func (r *leopardFF8) Join(dst io.Writer, shards [][]byte, outSize int) error { + // Do we have enough shards? + if len(shards) < r.dataShards { + return ErrTooFewShards + } + shards = shards[:r.dataShards] + + // Do we have enough data? + size := 0 + for _, shard := range shards { + if shard == nil { + return ErrReconstructRequired + } + size += len(shard) + + // Do we have enough data already? + if size >= outSize { + break + } + } + if size < outSize { + return ErrShortData + } + + // Copy data to dst + write := outSize + for _, shard := range shards { + if write < len(shard) { + _, err := dst.Write(shard[:write]) + return err + } + n, err := dst.Write(shard) + if err != nil { + return err + } + write -= n + } + return nil +} + +func (r *leopardFF8) Update(shards [][]byte, newDatashards [][]byte) error { + return ErrNotSupported +} + +func (r *leopardFF8) Split(data []byte) ([][]byte, error) { + if len(data) == 0 { + return nil, ErrShortData + } + if r.totalShards == 1 && len(data)&63 == 0 { + return [][]byte{data}, nil + } + + dataLen := len(data) + // Calculate number of bytes per data shard. + perShard := (len(data) + r.dataShards - 1) / r.dataShards + perShard = ((perShard + 63) / 64) * 64 + needTotal := r.totalShards * perShard + + if cap(data) > len(data) { + if cap(data) > needTotal { + data = data[:needTotal] + } else { + data = data[:cap(data)] + } + clear := data[dataLen:] + for i := range clear { + clear[i] = 0 + } + } + + // Only allocate memory if necessary + var padding [][]byte + if len(data) < needTotal { + // calculate maximum number of full shards in `data` slice + fullShards := len(data) / perShard + padding = AllocAligned(r.totalShards-fullShards, perShard) + if dataLen > perShard*fullShards { + // Copy partial shards + copyFrom := data[perShard*fullShards : dataLen] + for i := range padding { + if len(copyFrom) == 0 { + break + } + copyFrom = copyFrom[copy(padding[i], copyFrom):] + } + } + } + + // Split into equal-length shards. 
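Split above rounds each shard up to a multiple of 64 bytes and zero-pads, so the original length has to be carried separately and handed back to Join. A hedged round-trip sketch using the public API of this vendored package; the shard counts and payload are made up:

```go
package main

import (
	"bytes"

	"github.com/klauspost/reedsolomon"
)

func main() {
	enc, err := reedsolomon.New(4, 2, reedsolomon.WithLeopardGF(true))
	if err != nil {
		panic(err)
	}
	data := []byte("example payload for a leopard split/encode/join round trip")
	shards, err := enc.Split(data) // shards are zero-padded to a multiple of 64 bytes
	if err != nil {
		panic(err)
	}
	if err := enc.Encode(shards); err != nil {
		panic(err)
	}
	var out bytes.Buffer
	// Join needs the original length, since the padding is not recorded anywhere.
	if err := enc.Join(&out, shards, len(data)); err != nil {
		panic(err)
	}
	_ = out.Bytes() // equals data
}
```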
+ dst := make([][]byte, r.totalShards) + i := 0 + for ; i < len(dst) && len(data) >= perShard; i++ { + dst[i] = data[:perShard:perShard] + data = data[perShard:] + } + + for j := 0; i+j < len(dst); j++ { + dst[i+j] = padding[0] + padding = padding[1:] + } + + return dst, nil +} + +func (r *leopardFF8) ReconstructSome(shards [][]byte, required []bool) error { + if len(required) == r.totalShards { + return r.reconstruct(shards, true) + } + return r.reconstruct(shards, false) +} + +func (r *leopardFF8) Reconstruct(shards [][]byte) error { + return r.reconstruct(shards, true) +} + +func (r *leopardFF8) ReconstructData(shards [][]byte) error { + return r.reconstruct(shards, false) +} + +func (r *leopardFF8) Verify(shards [][]byte) (bool, error) { + if len(shards) != r.totalShards { + return false, ErrTooFewShards + } + if err := checkShards(shards, false); err != nil { + return false, err + } + + // Re-encode parity shards to temporary storage. + shardSize := len(shards[0]) + outputs := make([][]byte, r.totalShards) + copy(outputs, shards[:r.dataShards]) + for i := r.dataShards; i < r.totalShards; i++ { + outputs[i] = make([]byte, shardSize) + } + if err := r.Encode(outputs); err != nil { + return false, err + } + + // Compare. + for i := r.dataShards; i < r.totalShards; i++ { + if !bytes.Equal(outputs[i], shards[i]) { + return false, nil + } + } + return true, nil +} + +func (r *leopardFF8) reconstruct(shards [][]byte, recoverAll bool) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + if err := checkShards(shards, true); err != nil { + return err + } + + // Quick check: are all of the shards present? If so, there's + // nothing to do. + numberPresent := 0 + dataPresent := 0 + for i := 0; i < r.totalShards; i++ { + if len(shards[i]) != 0 { + numberPresent++ + if i < r.dataShards { + dataPresent++ + } + } + } + if numberPresent == r.totalShards || !recoverAll && dataPresent == r.dataShards { + // Cool. All of the shards have data. We don't + // need to do anything. + return nil + } + + // Check if we have enough to reconstruct. + if numberPresent < r.dataShards { + return ErrTooFewShards + } + + shardSize := shardSize(shards) + if shardSize%64 != 0 { + return ErrInvalidShardSize + } + + // Use only if we are missing less than 1/4 parity, + // And we are restoring a significant amount of data. + useBits := r.totalShards-numberPresent <= r.parityShards/4 && shardSize*r.totalShards >= 64<<10 + + m := ceilPow2(r.parityShards) + n := ceilPow2(m + r.dataShards) + + const LEO_ERROR_BITFIELD_OPT = true + + // Fill in error locations. + var errorBits errorBitfield8 + var errLocs [order8]ffe8 + for i := 0; i < r.parityShards; i++ { + if len(shards[i+r.dataShards]) == 0 { + errLocs[i] = 1 + if LEO_ERROR_BITFIELD_OPT && recoverAll { + errorBits.set(i) + } + } + } + for i := r.parityShards; i < m; i++ { + errLocs[i] = 1 + if LEO_ERROR_BITFIELD_OPT && recoverAll { + errorBits.set(i) + } + } + for i := 0; i < r.dataShards; i++ { + if len(shards[i]) == 0 { + errLocs[i+m] = 1 + if LEO_ERROR_BITFIELD_OPT { + errorBits.set(i + m) + } + } + } + + var gotInversion bool + if LEO_ERROR_BITFIELD_OPT && r.inversion != nil { + cacheID := errorBits.cacheID() + r.inversionMu.Lock() + if inv, ok := r.inversion[cacheID]; ok { + r.inversionMu.Unlock() + errLocs = inv.errorLocs + if inv.bits != nil && useBits { + errorBits = *inv.bits + useBits = true + } else { + useBits = false + } + gotInversion = true + } else { + r.inversionMu.Unlock() + } + } + + if !gotInversion { + // No inversion... 
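The reconstruction path here treats nil or zero-length entries as missing shards and gives up once fewer than dataShards remain. A short hedged sketch of the public flow, with made-up sizes:

```go
package main

import "github.com/klauspost/reedsolomon"

func main() {
	enc, err := reedsolomon.New(4, 2, reedsolomon.WithLeopardGF(true))
	if err != nil {
		panic(err)
	}
	shards, _ := enc.Split(make([]byte, 1<<10)) // hypothetical 1 KiB payload
	if err := enc.Encode(shards); err != nil {
		panic(err)
	}
	// nil (or zero-length) marks a shard as missing and to be reconstructed.
	shards[1] = nil // a data shard
	shards[5] = nil // a parity shard
	if err := enc.Reconstruct(shards); err != nil {
		panic(err)
	}
	ok, err := enc.Verify(shards) // re-encodes parity and compares
	_, _ = ok, err
}
```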
+ if LEO_ERROR_BITFIELD_OPT && useBits { + errorBits.prepare() + } + + // Evaluate error locator polynomial8 + fwht8(&errLocs, order8, m+r.dataShards) + + for i := 0; i < order8; i++ { + errLocs[i] = ffe8((uint(errLocs[i]) * uint(logWalsh8[i])) % modulus8) + } + + fwht8(&errLocs, order8, order8) + + if r.inversion != nil { + c := leopardGF8cache{ + errorLocs: errLocs, + } + if useBits { + // Heap alloc + var x errorBitfield8 + x = errorBits + c.bits = &x + } + r.inversionMu.Lock() + r.inversion[errorBits.cacheID()] = c + r.inversionMu.Unlock() + } + } + + var work [][]byte + if w, ok := r.workPool.Get().([][]byte); ok { + work = w + } + if cap(work) >= n { + work = work[:n] + for i := range work { + if cap(work[i]) < workSize8 { + work[i] = make([]byte, workSize8) + } else { + work[i] = work[i][:workSize8] + } + } + + } else { + work = make([][]byte, n) + all := make([]byte, n*workSize8) + for i := range work { + work[i] = all[i*workSize8 : i*workSize8+workSize8] + } + } + defer r.workPool.Put(work) + + // work <- recovery data + + // Split large shards. + // More likely on lower shard count. + sh := make([][]byte, len(shards)) + // Copy... + copy(sh, shards) + + // Add output + for i, sh := range shards { + if !recoverAll && i >= r.dataShards { + continue + } + if len(sh) == 0 { + if cap(sh) >= shardSize { + shards[i] = sh[:shardSize] + } else { + shards[i] = make([]byte, shardSize) + } + } + } + + off := 0 + for off < shardSize { + endSlice := off + workSize8 + if endSlice > shardSize { + endSlice = shardSize + sz := shardSize - off + // Last iteration only + for i := range work { + work[i] = work[i][:sz] + } + } + for i := range shards { + if len(sh[i]) != 0 { + sh[i] = shards[i][off:endSlice] + } + } + for i := 0; i < r.parityShards; i++ { + if len(sh[i+r.dataShards]) != 0 { + mulgf8(work[i], sh[i+r.dataShards], errLocs[i], &r.o) + } else { + memclr(work[i]) + } + } + for i := r.parityShards; i < m; i++ { + memclr(work[i]) + } + + // work <- original data + + for i := 0; i < r.dataShards; i++ { + if len(sh[i]) != 0 { + mulgf8(work[m+i], sh[i], errLocs[m+i], &r.o) + } else { + memclr(work[m+i]) + } + } + for i := m + r.dataShards; i < n; i++ { + memclr(work[i]) + } + + // work <- IFFT(work, n, 0) + + ifftDITDecoder8( + m+r.dataShards, + work, + n, + fftSkew8[:], + &r.o, + ) + + // work <- FormalDerivative(work, n) + + for i := 1; i < n; i++ { + width := ((i ^ (i - 1)) + 1) >> 1 + slicesXor(work[i-width:i], work[i:i+width], &r.o) + } + + // work <- FFT(work, n, 0) truncated to m + dataShards + + outputCount := m + r.dataShards + + if LEO_ERROR_BITFIELD_OPT && useBits { + errorBits.fftDIT8(work, outputCount, n, fftSkew8[:], &r.o) + } else { + fftDIT8(work, outputCount, n, fftSkew8[:], &r.o) + } + + // Reveal erasures + // + // Original = -ErrLocator * FFT( Derivative( IFFT( ErrLocator * ReceivedData ) ) ) + // mul_mem(x, y, log_m, ) equals x[] = y[] * log_m + // + // mem layout: [Recovery Data (Power of Two = M)] [Original Data (K)] [Zero Padding out to N] + end := r.dataShards + if recoverAll { + end = r.totalShards + } + // Restore + for i := 0; i < end; i++ { + if len(sh[i]) != 0 { + continue + } + + if i >= r.dataShards { + // Parity shard. + mulgf8(shards[i][off:endSlice], work[i-r.dataShards], modulus8-errLocs[i-r.dataShards], &r.o) + } else { + // Data shard. 
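The formal-derivative step above computes `width := ((i ^ (i - 1)) + 1) >> 1`, which isolates the lowest set bit of `i`. A tiny standalone check of that identity (illustrative only, not library code):

```go
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	for i := 1; i < 1<<12; i++ {
		width := ((i ^ (i - 1)) + 1) >> 1
		// The expression equals the lowest set bit, i.e. 1 << TrailingZeros(i).
		if width != 1<<bits.TrailingZeros(uint(i)) {
			fmt.Println("mismatch at", i)
		}
	}
}
```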
+ mulgf8(shards[i][off:endSlice], work[i+m], modulus8-errLocs[i+m], &r.o) + } + } + off += workSize8 + } + return nil +} + +// Basic no-frills version for decoder +func ifftDITDecoder8(mtrunc int, work [][]byte, m int, skewLUT []ffe8, o *options) { + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend-1] + log_m02 := skewLUT[iend+dist-1] + log_m23 := skewLUT[iend+dist*2-1] + + // For each set of dist elements: + for i := r; i < iend; i++ { + ifftDIT48(work[i:], dist, log_m01, log_m23, log_m02, o) + } + } + dist = dist4 + dist4 <<= 2 + } + + // If there is one layer left: + if dist < m { + // Assuming that dist = m / 2 + if dist*2 != m { + panic("internal error") + } + + log_m := skewLUT[dist-1] + + if log_m == modulus8 { + slicesXor(work[dist:2*dist], work[:dist], o) + } else { + for i := 0; i < dist; i++ { + ifftDIT28( + work[i], + work[i+dist], + log_m, + o, + ) + } + } + } +} + +// In-place FFT for encoder and decoder +func fftDIT8(work [][]byte, mtrunc, m int, skewLUT []ffe8, o *options) { + // Decimation in time: Unroll 2 layers at a time + dist4 := m + dist := m >> 2 + for dist != 0 { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend-1] + log_m02 := skewLUT[iend+dist-1] + log_m23 := skewLUT[iend+dist*2-1] + + // For each set of dist elements: + for i := r; i < iend; i++ { + fftDIT48( + work[i:], + dist, + log_m01, + log_m23, + log_m02, + o, + ) + } + } + dist4 = dist + dist >>= 2 + } + + // If there is one layer left: + if dist4 == 2 { + for r := 0; r < mtrunc; r += 2 { + log_m := skewLUT[r+1-1] + + if log_m == modulus8 { + sliceXor(work[r], work[r+1], o) + } else { + fftDIT28(work[r], work[r+1], log_m, o) + } + } + } +} + +// 4-way butterfly +func fftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + // First layer: + if log_m02 == modulus8 { + sliceXor(work[0], work[dist*2], o) + sliceXor(work[dist], work[dist*3], o) + } else { + fftDIT28(work[0], work[dist*2], log_m02, o) + fftDIT28(work[dist], work[dist*3], log_m02, o) + } + + // Second layer: + if log_m01 == modulus8 { + sliceXor(work[0], work[dist], o) + } else { + fftDIT28(work[0], work[dist], log_m01, o) + } + + if log_m23 == modulus8 { + sliceXor(work[dist*2], work[dist*3], o) + } else { + fftDIT28(work[dist*2], work[dist*3], log_m23, o) + } +} + +// Unrolled IFFT for encoder +func ifftDITEncoder8(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe8, o *options) { + // I tried rolling the memcpy/memset into the first layer of the FFT and + // found that it only yields a 4% performance improvement, which is not + // worth the extra complexity. + for i := 0; i < mtrunc; i++ { + copy(work[i], data[i]) + } + for i := mtrunc; i < m; i++ { + memclr(work[i]) + } + + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + iend := r + dist + log_m01 := skewLUT[iend] + log_m02 := skewLUT[iend+dist] + log_m23 := skewLUT[iend+dist*2] + + // For each set of dist elements: + for i := r; i < iend; i++ { + ifftDIT48( + work[i:], + dist, + log_m01, + log_m23, + log_m02, + o, + ) + } + } + + dist = dist4 + dist4 <<= 2 + // I tried alternating sweeps left->right and right->left to reduce cache misses. 
+ // It provides about 1% performance boost when done for both FFT and IFFT, so it + // does not seem to be worth the extra complexity. + } + + // If there is one layer left: + if dist < m { + // Assuming that dist = m / 2 + if dist*2 != m { + panic("internal error") + } + + logm := skewLUT[dist] + + if logm == modulus8 { + slicesXor(work[dist:dist*2], work[:dist], o) + } else { + for i := 0; i < dist; i++ { + ifftDIT28(work[i], work[i+dist], logm, o) + } + } + } + + // I tried unrolling this but it does not provide more than 5% performance + // improvement for 16-bit finite fields, so it's not worth the complexity. + if xorRes != nil { + slicesXor(xorRes[:m], work[:m], o) + } +} + +func ifftDIT4Ref8(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) { + // First layer: + if log_m01 == modulus8 { + sliceXor(work[0], work[dist], o) + } else { + ifftDIT28(work[0], work[dist], log_m01, o) + } + + if log_m23 == modulus8 { + sliceXor(work[dist*2], work[dist*3], o) + } else { + ifftDIT28(work[dist*2], work[dist*3], log_m23, o) + } + + // Second layer: + if log_m02 == modulus8 { + sliceXor(work[0], work[dist*2], o) + sliceXor(work[dist], work[dist*3], o) + } else { + ifftDIT28(work[0], work[dist*2], log_m02, o) + ifftDIT28(work[dist], work[dist*3], log_m02, o) + } +} + +// Reference version of muladd: x[] ^= y[] * log_m +func refMulAdd8(x, y []byte, log_m ffe8) { + lut := &mul8LUTs[log_m] + + for len(x) >= 64 { + // Assert sizes for no bounds checks in loop + src := y[:64] + dst := x[:len(src)] // Needed, but not checked... + for i, y1 := range src { + dst[i] ^= byte(lut.Value[y1]) + } + x = x[64:] + y = y[64:] + } +} + +// Reference version of mul: x[] = y[] * log_m +func refMul8(x, y []byte, log_m ffe8) { + lut := &mul8LUTs[log_m] + + for off := 0; off < len(x); off += 64 { + src := y[off : off+64] + for i, y1 := range src { + x[off+i] = byte(lut.Value[y1]) + } + } +} + +// Returns a * Log(b) +func mulLog8(a, log_b ffe8) ffe8 { + /* + Note that this operation is not a normal multiplication in a finite + field because the right operand is already a logarithm. This is done + because it moves K table lookups from the Decode() method into the + initialization step that is less performance critical. The LogWalsh[] + table below contains precalculated logarithms so it is easier to do + all the other multiplies in that form as well. + */ + if a == 0 { + return 0 + } + return expLUT8[addMod8(logLUT8[a], log_b)] +} + +// z = x + y (mod kModulus) +func addMod8(a, b ffe8) ffe8 { + sum := uint(a) + uint(b) + + // Partial reduction step, allowing for kModulus to be returned + return ffe8(sum + sum>>bitwidth8) +} + +// z = x - y (mod kModulus) +func subMod8(a, b ffe8) ffe8 { + dif := uint(a) - uint(b) + + // Partial reduction step, allowing for kModulus to be returned + return ffe8(dif + dif>>bitwidth8) +} + +// Decimation in time (DIT) Fast Walsh-Hadamard Transform +// Unrolls pairs of layers to perform cross-layer operations in registers +// mtrunc: Number of elements that are non-zero at the front of data +func fwht8(data *[order8]ffe8, m, mtrunc int) { + // Decimation in time: Unroll 2 layers at a time + dist := 1 + dist4 := 4 + for dist4 <= m { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + // For each set of dist elements: + // Use 16 bit indices to avoid bounds check on [65536]ffe8. + dist := uint16(dist) + off := uint16(r) + for i := uint16(0); i < dist; i++ { + // fwht48(data[i:], dist) inlined... 
+ // Reading values appear faster than updating pointers. + // Casting to uint is not faster. + t0 := data[off] + t1 := data[off+dist] + t2 := data[off+dist*2] + t3 := data[off+dist*3] + + t0, t1 = fwht2alt8(t0, t1) + t2, t3 = fwht2alt8(t2, t3) + t0, t2 = fwht2alt8(t0, t2) + t1, t3 = fwht2alt8(t1, t3) + + data[off] = t0 + data[off+dist] = t1 + data[off+dist*2] = t2 + data[off+dist*3] = t3 + off++ + } + } + dist = dist4 + dist4 <<= 2 + } + + // If there is one layer left: + if dist < m { + dist := uint16(dist) + for i := uint16(0); i < dist; i++ { + fwht28(&data[i], &data[i+dist]) + } + } +} + +func fwht48(data []ffe8, s int) { + s2 := s << 1 + + t0 := &data[0] + t1 := &data[s] + t2 := &data[s2] + t3 := &data[s2+s] + + fwht28(t0, t1) + fwht28(t2, t3) + fwht28(t0, t2) + fwht28(t1, t3) +} + +// {a, b} = {a + b, a - b} (Mod Q) +func fwht28(a, b *ffe8) { + sum := addMod8(*a, *b) + dif := subMod8(*a, *b) + *a = sum + *b = dif +} + +// fwht2alt8 is as fwht28, but returns result. +func fwht2alt8(a, b ffe8) (ffe8, ffe8) { + return addMod8(a, b), subMod8(a, b) +} + +var initOnce8 sync.Once + +func initConstants8() { + initOnce8.Do(func() { + initLUTs8() + initFFTSkew8() + initMul8LUT() + }) +} + +// Initialize logLUT8, expLUT8. +func initLUTs8() { + cantorBasis := [bitwidth8]ffe8{ + 1, 214, 152, 146, 86, 200, 88, 230, + } + + expLUT8 = &[order8]ffe8{} + logLUT8 = &[order8]ffe8{} + + // LFSR table generation: + state := 1 + for i := ffe8(0); i < modulus8; i++ { + expLUT8[state] = i + state <<= 1 + if state >= order8 { + state ^= polynomial8 + } + } + expLUT8[0] = modulus8 + + // Conversion to Cantor basis: + + logLUT8[0] = 0 + for i := 0; i < bitwidth8; i++ { + basis := cantorBasis[i] + width := 1 << i + + for j := 0; j < width; j++ { + logLUT8[j+width] = logLUT8[j] ^ basis + } + } + + for i := 0; i < order8; i++ { + logLUT8[i] = expLUT8[logLUT8[i]] + } + + for i := 0; i < order8; i++ { + expLUT8[logLUT8[i]] = ffe8(i) + } + + expLUT8[modulus8] = expLUT8[0] +} + +// Initialize fftSkew8. +func initFFTSkew8() { + var temp [bitwidth8 - 1]ffe8 + + // Generate FFT skew vector {1}: + + for i := 1; i < bitwidth8; i++ { + temp[i-1] = ffe8(1 << i) + } + + fftSkew8 = &[modulus8]ffe8{} + logWalsh8 = &[order8]ffe8{} + + for m := 0; m < bitwidth8-1; m++ { + step := 1 << (m + 1) + + fftSkew8[1<>4)+16)] + } + } + // Always initialize assembly tables. + // Not as big resource hog as gf16. 
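The addMod8/subMod8 helpers above use a partial reduction that can return the modulus itself (255) rather than 0; that is harmless because 255 is congruent to 0 mod 255 and the table setup above maps both to the same slot (expLUT8[modulus8] = expLUT8[0]). A standalone arithmetic check of the carry-fold, re-declared here purely for illustration:

```go
package main

import "fmt"

type ffe8 uint8

// addMod8 mirrors the partial reduction above: fold the carry bit back in.
// The result stays congruent mod 255 but may be 255 instead of 0.
func addMod8(a, b ffe8) ffe8 {
	sum := uint(a) + uint(b)
	return ffe8(sum + sum>>8)
}

func main() {
	fmt.Println(addMod8(250, 10))  // 5, and 260 mod 255 == 5
	fmt.Println(addMod8(255, 255)) // 255, congruent to 0 mod 255
}
```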
+ if true { + multiply256LUT8 = &[order8][16 * 2]byte{} + + for logM := range multiply256LUT8[:] { + // For each 4 bits of the finite field width in bits: + shift := 0 + for i := 0; i < 2; i++ { + // Construct 16 entry LUT for PSHUFB + prod := multiply256LUT8[logM][i*16 : i*16+16] + for x := range prod[:] { + prod[x] = byte(mulLog8(ffe8(x<= 8 || mipLevel <= 0 { + return true + } + return 0 != (e.Words[mipLevel-1][bit/64] & (uint64(1) << (bit & 63))) +} + +func (e *errorBitfield8) prepare() { + // First mip level is for final layer of FFT: pairs of data + for i := 0; i < kWords8; i++ { + w_i := e.Words[0][i] + hi2lo0 := w_i | ((w_i & kHiMasks[0]) >> 1) + lo2hi0 := (w_i & (kHiMasks[0] >> 1)) << 1 + w_i = hi2lo0 | lo2hi0 + e.Words[0][i] = w_i + + bits := 2 + for j := 1; j < 5; j++ { + hi2lo_j := w_i | ((w_i & kHiMasks[j]) >> bits) + lo2hi_j := (w_i & (kHiMasks[j] >> bits)) << bits + w_i = hi2lo_j | lo2hi_j + e.Words[j][i] = w_i + bits <<= 1 + } + } + + for i := 0; i < kWords8; i++ { + w := e.Words[4][i] + w |= w >> 32 + w |= w << 32 + e.Words[5][i] = w + } + + for i := 0; i < kWords8; i += 2 { + t := e.Words[5][i] | e.Words[5][i+1] + e.Words[6][i] = t + e.Words[6][i+1] = t + } +} + +func (e *errorBitfield8) fftDIT8(work [][]byte, mtrunc, m int, skewLUT []ffe8, o *options) { + // Decimation in time: Unroll 2 layers at a time + mipLevel := bits.Len32(uint32(m)) - 1 + + dist4 := m + dist := m >> 2 + for dist != 0 { + // For each set of dist*4 elements: + for r := 0; r < mtrunc; r += dist4 { + if !e.isNeeded(mipLevel, r) { + continue + } + iEnd := r + dist + logM01 := skewLUT[iEnd-1] + logM02 := skewLUT[iEnd+dist-1] + logM23 := skewLUT[iEnd+dist*2-1] + + // For each set of dist elements: + for i := r; i < iEnd; i++ { + fftDIT48( + work[i:], + dist, + logM01, + logM23, + logM02, + o) + } + } + dist4 = dist + dist >>= 2 + mipLevel -= 2 + } + + // If there is one layer left: + if dist4 == 2 { + for r := 0; r < mtrunc; r += 2 { + if !e.isNeeded(mipLevel, r) { + continue + } + logM := skewLUT[r+1-1] + + if logM == modulus8 { + sliceXor(work[r], work[r+1], o) + } else { + fftDIT28(work[r], work[r+1], logM, o) + } + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/matrix.go b/vendor/github.com/klauspost/reedsolomon/matrix.go new file mode 100644 index 000000000..bfdcca66f --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/matrix.go @@ -0,0 +1,281 @@ +/** + * Matrix Algebra over an 8-bit Galois Field + * + * Copyright 2015, Klaus Post + * Copyright 2015, Backblaze, Inc. + */ + +package reedsolomon + +import ( + "errors" + "fmt" + "strconv" + "strings" +) + +// byte[row][col] +type matrix [][]byte + +// newMatrix returns a matrix of zeros. +func newMatrix(rows, cols int) (matrix, error) { + if rows <= 0 { + return nil, errInvalidRowSize + } + if cols <= 0 { + return nil, errInvalidColSize + } + + m := matrix(make([][]byte, rows)) + for i := range m { + m[i] = make([]byte, cols) + } + return m, nil +} + +// NewMatrixData initializes a matrix with the given row-major data. +// Note that data is not copied from input. +func newMatrixData(data [][]byte) (matrix, error) { + m := matrix(data) + err := m.Check() + if err != nil { + return nil, err + } + return m, nil +} + +// IdentityMatrix returns an identity matrix of the given size. 
+func identityMatrix(size int) (matrix, error) { + m, err := newMatrix(size, size) + if err != nil { + return nil, err + } + for i := range m { + m[i][i] = 1 + } + return m, nil +} + +// errInvalidRowSize will be returned if attempting to create a matrix with negative or zero row number. +var errInvalidRowSize = errors.New("invalid row size") + +// errInvalidColSize will be returned if attempting to create a matrix with negative or zero column number. +var errInvalidColSize = errors.New("invalid column size") + +// errColSizeMismatch is returned if the size of matrix columns mismatch. +var errColSizeMismatch = errors.New("column size is not the same for all rows") + +func (m matrix) Check() error { + rows := len(m) + if rows == 0 { + return errInvalidRowSize + } + cols := len(m[0]) + if cols == 0 { + return errInvalidColSize + } + + for _, col := range m { + if len(col) != cols { + return errColSizeMismatch + } + } + return nil +} + +// String returns a human-readable string of the matrix contents. +// +// Example: [[1, 2], [3, 4]] +func (m matrix) String() string { + rowOut := make([]string, 0, len(m)) + for _, row := range m { + colOut := make([]string, 0, len(row)) + for _, col := range row { + colOut = append(colOut, strconv.Itoa(int(col))) + } + rowOut = append(rowOut, "["+strings.Join(colOut, ", ")+"]") + } + return "[" + strings.Join(rowOut, ", ") + "]" +} + +// Multiply multiplies this matrix (the one on the left) by another +// matrix (the one on the right) and returns a new matrix with the result. +func (m matrix) Multiply(right matrix) (matrix, error) { + if len(m[0]) != len(right) { + return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", len(m[0]), len(right)) + } + result, _ := newMatrix(len(m), len(right[0])) + for r, row := range result { + for c := range row { + var value byte + for i := range m[0] { + value ^= galMultiply(m[r][i], right[i][c]) + } + result[r][c] = value + } + } + return result, nil +} + +// Augment returns the concatenation of this matrix and the matrix on the right. +func (m matrix) Augment(right matrix) (matrix, error) { + if len(m) != len(right) { + return nil, errMatrixSize + } + + result, _ := newMatrix(len(m), len(m[0])+len(right[0])) + for r, row := range m { + for c := range row { + result[r][c] = m[r][c] + } + cols := len(m[0]) + for c := range right[0] { + result[r][cols+c] = right[r][c] + } + } + return result, nil +} + +// errMatrixSize is returned if matrix dimensions are doesn't match. +var errMatrixSize = errors.New("matrix sizes do not match") + +func (m matrix) SameSize(n matrix) error { + if len(m) != len(n) { + return errMatrixSize + } + for i := range m { + if len(m[i]) != len(n[i]) { + return errMatrixSize + } + } + return nil +} + +// SubMatrix returns a part of this matrix. Data is copied. +func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) { + result, err := newMatrix(rmax-rmin, cmax-cmin) + if err != nil { + return nil, err + } + // OPTME: If used heavily, use copy function to copy slice + for r := rmin; r < rmax; r++ { + for c := cmin; c < cmax; c++ { + result[r-rmin][c-cmin] = m[r][c] + } + } + return result, nil +} + +// SwapRows Exchanges two rows in the matrix. +func (m matrix) SwapRows(r1, r2 int) error { + if r1 < 0 || len(m) <= r1 || r2 < 0 || len(m) <= r2 { + return errInvalidRowSize + } + m[r2], m[r1] = m[r1], m[r2] + return nil +} + +// IsSquare will return true if the matrix is square, otherwise false. 
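A property worth keeping in mind for the helpers above: multiplying by the identity in GF(2^8) leaves a matrix unchanged, because galMultiply(1, x) == x and galMultiply(0, x) == 0. A sketch of how that could be checked; it would have to live in this package since the helpers are unexported, and `exampleIdentityMultiply` is a made-up name:

```go
// Illustrative, same-package sketch (matrix, newMatrixData, identityMatrix and
// Multiply are the unexported helpers defined above).
func exampleIdentityMultiply() {
	m, _ := newMatrixData([][]byte{{1, 2}, {3, 4}})
	id, _ := identityMatrix(2)
	out, _ := id.Multiply(m)
	if out.String() != m.String() {
		panic("identity multiply changed the matrix")
	}
}
```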
+func (m matrix) IsSquare() bool { + return len(m) == len(m[0]) +} + +// errSingular is returned if the matrix is singular and cannot be inversed +var errSingular = errors.New("matrix is singular") + +// errNotSquare is returned if attempting to inverse a non-square matrix. +var errNotSquare = errors.New("only square matrices can be inverted") + +// Invert returns the inverse of this matrix. +// Returns ErrSingular when the matrix is singular and doesn't have an inverse. +// The matrix must be square, otherwise ErrNotSquare is returned. +func (m matrix) Invert() (matrix, error) { + if !m.IsSquare() { + return nil, errNotSquare + } + + size := len(m) + work, _ := identityMatrix(size) + work, _ = m.Augment(work) + + err := work.gaussianElimination() + if err != nil { + return nil, err + } + + return work.SubMatrix(0, size, size, size*2) +} + +func (m matrix) gaussianElimination() error { + rows := len(m) + columns := len(m[0]) + // Clear out the part below the main diagonal and scale the main + // diagonal to be 1. + for r := 0; r < rows; r++ { + // If the element on the diagonal is 0, find a row below + // that has a non-zero and swap them. + if m[r][r] == 0 { + for rowBelow := r + 1; rowBelow < rows; rowBelow++ { + if m[rowBelow][r] != 0 { + err := m.SwapRows(r, rowBelow) + if err != nil { + return err + } + break + } + } + } + // If we couldn't find one, the matrix is singular. + if m[r][r] == 0 { + return errSingular + } + // Scale to 1. + if m[r][r] != 1 { + scale := galOneOver(m[r][r]) + for c := 0; c < columns; c++ { + m[r][c] = galMultiply(m[r][c], scale) + } + } + // Make everything below the 1 be a 0 by subtracting + // a multiple of it. (Subtraction and addition are + // both exclusive or in the Galois field.) + for rowBelow := r + 1; rowBelow < rows; rowBelow++ { + if m[rowBelow][r] != 0 { + scale := m[rowBelow][r] + for c := 0; c < columns; c++ { + m[rowBelow][c] ^= galMultiply(scale, m[r][c]) + } + } + } + } + + // Now clear the part above the main diagonal. + for d := 0; d < rows; d++ { + for rowAbove := 0; rowAbove < d; rowAbove++ { + if m[rowAbove][d] != 0 { + scale := m[rowAbove][d] + for c := 0; c < columns; c++ { + m[rowAbove][c] ^= galMultiply(scale, m[d][c]) + } + + } + } + } + return nil +} + +// Create a Vandermonde matrix, which is guaranteed to have the +// property that any subset of rows that forms a square matrix +// is invertible. +func vandermonde(rows, cols int) (matrix, error) { + result, err := newMatrix(rows, cols) + if err != nil { + return nil, err + } + for r, row := range result { + for c := range row { + result[r][c] = galExp(byte(r), c) + } + } + return result, nil +} diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go new file mode 100644 index 000000000..73cc7d6d2 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/options.go @@ -0,0 +1,323 @@ +package reedsolomon + +import ( + "runtime" + "strings" + + "github.com/klauspost/cpuid/v2" +) + +// Option allows to override processing parameters. 
+type Option func(*options) + +type options struct { + maxGoroutines int + minSplitSize int + shardSize int + perRound int + + useAvxGNFI, + useAvx512GFNI, + useAVX512, + useAVX2, + useSSSE3, + useSSE2 bool + + useJerasureMatrix bool + usePAR1Matrix bool + useCauchy bool + fastOneParity bool + inversionCache bool + forcedInversionCache bool + customMatrix [][]byte + withLeopard leopardMode + + // stream options + concReads bool + concWrites bool + streamBS int +} + +var defaultOptions = options{ + maxGoroutines: 384, + minSplitSize: -1, + fastOneParity: false, + inversionCache: true, + + // Detect CPU capabilities. + useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), + useSSE2: cpuid.CPU.Supports(cpuid.SSE2), + useAVX2: cpuid.CPU.Supports(cpuid.AVX2), + useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), + useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), + useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), +} + +// leopardMode controls the use of leopard GF in encoding and decoding. +type leopardMode int + +const ( + // leopardAsNeeded only switches to leopard 16-bit when there are more than + // 256 shards. + leopardAsNeeded leopardMode = iota + // leopardGF16 uses leopard in 16-bit mode for all shard counts. + leopardGF16 + // leopardAlways uses 8-bit leopard for shards less than or equal to 256, + // 16-bit leopard otherwise. + leopardAlways +) + +func init() { + if runtime.GOMAXPROCS(0) <= 1 { + defaultOptions.maxGoroutines = 1 + } +} + +// WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding. +// Jobs will be split into this many parts, unless each goroutine would have to process +// less than minSplitSize bytes (set with WithMinSplitSize). +// For the best speed, keep this well above the GOMAXPROCS number for more fine grained +// scheduling. +// If n <= 0, it is ignored. +func WithMaxGoroutines(n int) Option { + return func(o *options) { + if n > 0 { + o.maxGoroutines = n + } + } +} + +// WithAutoGoroutines will adjust the number of goroutines for optimal speed with a +// specific shard size. +// Send in the shard size you expect to send. Other shard sizes will work, but may not +// run at the optimal speed. +// Overwrites WithMaxGoroutines. +// If shardSize <= 0, it is ignored. +func WithAutoGoroutines(shardSize int) Option { + return func(o *options) { + o.shardSize = shardSize + } +} + +// WithMinSplitSize is the minimum encoding size in bytes per goroutine. +// By default this parameter is determined by CPU cache characteristics. +// See WithMaxGoroutines on how jobs are split. +// If n <= 0, it is ignored. +func WithMinSplitSize(n int) Option { + return func(o *options) { + if n > 0 { + o.minSplitSize = n + } + } +} + +// WithConcurrentStreams will enable concurrent reads and writes on the streams. +// Default: Disabled, meaning only one stream will be read/written at the time. +// Ignored if not used on a stream input. +func WithConcurrentStreams(enabled bool) Option { + return func(o *options) { + o.concReads, o.concWrites = enabled, enabled + } +} + +// WithConcurrentStreamReads will enable concurrent reads from the input streams. +// Default: Disabled, meaning only one stream will be read at the time. +// Ignored if not used on a stream input. +func WithConcurrentStreamReads(enabled bool) Option { + return func(o *options) { + o.concReads = enabled + } +} + +// WithConcurrentStreamWrites will enable concurrent writes to the the output streams. 
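The With... functions in this file are plain functional options applied over defaultOptions. A hedged example of combining a few of them through the public constructor; the specific values are arbitrary tuning choices, not recommendations:

```go
package main

import (
	"runtime"

	"github.com/klauspost/reedsolomon"
)

func main() {
	enc, err := reedsolomon.New(17, 3,
		// Keep the goroutine cap well above GOMAXPROCS, as the doc above suggests.
		reedsolomon.WithMaxGoroutines(runtime.GOMAXPROCS(0)*4),
		// Do not split work below 64 KiB per goroutine.
		reedsolomon.WithMinSplitSize(64<<10),
	)
	if err != nil {
		panic(err)
	}
	_ = enc
}
```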
+// Default: Disabled, meaning only one stream will be written at the time. +// Ignored if not used on a stream input. +func WithConcurrentStreamWrites(enabled bool) Option { + return func(o *options) { + o.concWrites = enabled + } +} + +// WithInversionCache allows to control the inversion cache. +// This will cache reconstruction matrices so they can be reused. +// Enabled by default, or <= 64 shards for Leopard encoding. +func WithInversionCache(enabled bool) Option { + return func(o *options) { + o.inversionCache = enabled + o.forcedInversionCache = true + } +} + +// WithStreamBlockSize allows to set a custom block size per round of reads/writes. +// If not set, any shard size set with WithAutoGoroutines will be used. +// If WithAutoGoroutines is also unset, 4MB will be used. +// Ignored if not used on stream. +func WithStreamBlockSize(n int) Option { + return func(o *options) { + o.streamBS = n + } +} + +// WithSSSE3 allows to enable/disable SSSE3 instructions. +// If not set, SSSE3 will be turned on or off automatically based on CPU ID information. +func WithSSSE3(enabled bool) Option { + return func(o *options) { + o.useSSSE3 = enabled + } +} + +// WithAVX2 allows to enable/disable AVX2 instructions. +// If not set, AVX will be turned on or off automatically based on CPU ID information. +// This will also disable AVX GFNI instructions. +func WithAVX2(enabled bool) Option { + return func(o *options) { + o.useAVX2 = enabled + if o.useAvxGNFI { + o.useAvxGNFI = enabled + } + } +} + +// WithSSE2 allows to enable/disable SSE2 instructions. +// If not set, SSE2 will be turned on or off automatically based on CPU ID information. +func WithSSE2(enabled bool) Option { + return func(o *options) { + o.useSSE2 = enabled + } +} + +// WithAVX512 allows to enable/disable AVX512 (and GFNI) instructions. +func WithAVX512(enabled bool) Option { + return func(o *options) { + o.useAVX512 = enabled + o.useAvx512GFNI = enabled + } +} + +// WithGFNI allows to enable/disable AVX512+GFNI instructions. +// If not set, GFNI will be turned on or off automatically based on CPU ID information. +func WithGFNI(enabled bool) Option { + return func(o *options) { + o.useAvx512GFNI = enabled + } +} + +// WithAVXGFNI allows to enable/disable GFNI with AVX instructions. +// If not set, GFNI will be turned on or off automatically based on CPU ID information. +func WithAVXGFNI(enabled bool) Option { + return func(o *options) { + o.useAvxGNFI = enabled + } +} + +// WithJerasureMatrix causes the encoder to build the Reed-Solomon-Vandermonde +// matrix in the same way as done by the Jerasure library. +// The first row and column of the coding matrix only contains 1's in this method +// so the first parity chunk is always equal to XOR of all data chunks. +func WithJerasureMatrix() Option { + return func(o *options) { + o.useJerasureMatrix = true + o.usePAR1Matrix = false + o.useCauchy = false + } +} + +// WithPAR1Matrix causes the encoder to build the matrix how PARv1 +// does. Note that the method they use is buggy, and may lead to cases +// where recovery is impossible, even if there are enough parity +// shards. +func WithPAR1Matrix() Option { + return func(o *options) { + o.useJerasureMatrix = false + o.usePAR1Matrix = true + o.useCauchy = false + } +} + +// WithCauchyMatrix will make the encoder build a Cauchy style matrix. +// The output of this is not compatible with the standard output. +// A Cauchy matrix is faster to generate. 
This does not affect data throughput, +// but will result in slightly faster start-up time. +func WithCauchyMatrix() Option { + return func(o *options) { + o.useJerasureMatrix = false + o.usePAR1Matrix = false + o.useCauchy = true + } +} + +// WithFastOneParityMatrix will switch the matrix to a simple xor +// if there is only one parity shard. +// The PAR1 matrix already has this property so it has little effect there. +func WithFastOneParityMatrix() Option { + return func(o *options) { + o.fastOneParity = true + } +} + +// WithCustomMatrix causes the encoder to use the manually specified matrix. +// customMatrix represents only the parity chunks. +// customMatrix must have at least ParityShards rows and DataShards columns. +// It can be used for interoperability with libraries which generate +// the matrix differently or to implement more complex coding schemes like LRC +// (locally reconstructible codes). +func WithCustomMatrix(customMatrix [][]byte) Option { + return func(o *options) { + o.customMatrix = customMatrix + } +} + +// WithLeopardGF16 will always use leopard GF16 for encoding, +// even when there is less than 256 shards. +// This will likely improve reconstruction time for some setups. +// This is not compatible with Leopard output for <= 256 shards. +// Note that Leopard places certain restrictions on use see other documentation. +func WithLeopardGF16(enabled bool) Option { + return func(o *options) { + if enabled { + o.withLeopard = leopardGF16 + } else { + o.withLeopard = leopardAsNeeded + } + } +} + +// WithLeopardGF will use leopard GF for encoding, even when there are fewer than +// 256 shards. +// This will likely improve reconstruction time for some setups. +// Note that Leopard places certain restrictions on use see other documentation. +func WithLeopardGF(enabled bool) Option { + return func(o *options) { + if enabled { + o.withLeopard = leopardAlways + } else { + o.withLeopard = leopardAsNeeded + } + } +} + +func (o *options) cpuOptions() string { + var res []string + if o.useSSE2 { + res = append(res, "SSE2") + } + if o.useAVX2 { + res = append(res, "AVX2") + } + if o.useSSSE3 { + res = append(res, "SSSE3") + } + if o.useAVX512 { + res = append(res, "AVX512") + } + if o.useAvx512GFNI { + res = append(res, "AVX512+GFNI") + } + if o.useAvxGNFI { + res = append(res, "AVX+GFNI") + } + if len(res) == 0 { + return "pure Go" + } + return strings.Join(res, ",") +} diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go new file mode 100644 index 000000000..bebba0445 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go @@ -0,0 +1,1741 @@ +/** + * Reed-Solomon Coding over 8-bit values. + * + * Copyright 2015, Klaus Post + * Copyright 2015, Backblaze, Inc. + */ + +// Package reedsolomon enables Erasure Coding in Go +// +// For usage and examples, see https://github.com/klauspost/reedsolomon +package reedsolomon + +import ( + "bytes" + "errors" + "fmt" + "io" + "runtime" + "sync" + + "github.com/klauspost/cpuid/v2" +) + +// Encoder is an interface to encode Reed-Salomon parity sets for your data. +type Encoder interface { + // Encode parity for a set of data shards. + // Input is 'shards' containing data shards followed by parity shards. + // The number of shards must match the number given to New(). + // Each shard is a byte array, and they must all be the same size. 
+ // The parity shards will always be overwritten and the data shards + // will remain the same, so it is safe for you to read from the + // data shards while this is running. + Encode(shards [][]byte) error + + // EncodeIdx will add parity for a single data shard. + // Parity shards should start out as 0. The caller must zero them. + // Data shards must be delivered exactly once. There is no check for this. + // The parity shards will always be updated and the data shards will remain the same. + EncodeIdx(dataShard []byte, idx int, parity [][]byte) error + + // Verify returns true if the parity shards contain correct data. + // The data is the same format as Encode. No data is modified, so + // you are allowed to read from data while this is running. + Verify(shards [][]byte) (bool, error) + + // Reconstruct will recreate the missing shards if possible. + // + // Given a list of shards, some of which contain data, fills in the + // ones that don't have data. + // + // The length of the array must be equal to the total number of shards. + // You indicate that a shard is missing by setting it to nil or zero-length. + // If a shard is zero-length but has sufficient capacity, that memory will + // be used, otherwise a new []byte will be allocated. + // + // If there are too few shards to reconstruct the missing + // ones, ErrTooFewShards will be returned. + // + // The reconstructed shard set is complete, but integrity is not verified. + // Use the Verify function to check if data set is ok. + Reconstruct(shards [][]byte) error + + // ReconstructData will recreate any missing data shards, if possible. + // + // Given a list of shards, some of which contain data, fills in the + // data shards that don't have data. + // + // The length of the array must be equal to Shards. + // You indicate that a shard is missing by setting it to nil or zero-length. + // If a shard is zero-length but has sufficient capacity, that memory will + // be used, otherwise a new []byte will be allocated. + // + // If there are too few shards to reconstruct the missing + // ones, ErrTooFewShards will be returned. + // + // As the reconstructed shard set may contain missing parity shards, + // calling the Verify function is likely to fail. + ReconstructData(shards [][]byte) error + + // ReconstructSome will recreate only requested shards, if possible. + // + // Given a list of shards, some of which contain data, fills in the + // shards indicated by true values in the "required" parameter. + // The length of the "required" array must be equal to either Shards or DataShards. + // If the length is equal to DataShards, the reconstruction of parity shards will be ignored. + // + // The length of "shards" array must be equal to Shards. + // You indicate that a shard is missing by setting it to nil or zero-length. + // If a shard is zero-length but has sufficient capacity, that memory will + // be used, otherwise a new []byte will be allocated. + // + // If there are too few shards to reconstruct the missing + // ones, ErrTooFewShards will be returned. + // + // As the reconstructed shard set may contain missing parity shards, + // calling the Verify function is likely to fail. + ReconstructSome(shards [][]byte, required []bool) error + + // Update parity is use for change a few data shards and update it's parity. + // Input 'newDatashards' containing data shards changed. + // Input 'shards' containing old data shards (if data shard not changed, it can be nil) and old parity shards. 
+ // new parity shards will in shards[DataShards:] + // Update is very useful if DataShards much larger than ParityShards and changed data shards is few. It will + // faster than Encode and not need read all data shards to encode. + Update(shards [][]byte, newDatashards [][]byte) error + + // Split a data slice into the number of shards given to the encoder, + // and create empty parity shards if necessary. + // + // The data will be split into equally sized shards. + // If the data size isn't divisible by the number of shards, + // the last shard will contain extra zeros. + // + // If there is extra capacity on the provided data slice + // it will be used instead of allocating parity shards. + // It will be zeroed out. + // + // There must be at least 1 byte otherwise ErrShortData will be + // returned. + // + // The data will not be copied, except for the last shard, so you + // should not modify the data of the input slice afterwards. + Split(data []byte) ([][]byte, error) + + // Join the shards and write the data segment to dst. + // + // Only the data shards are considered. + // You must supply the exact output size you want. + // If there are to few shards given, ErrTooFewShards will be returned. + // If the total data size is less than outSize, ErrShortData will be returned. + Join(dst io.Writer, shards [][]byte, outSize int) error +} + +// Extensions is an optional interface. +// All returned instances will support this interface. +type Extensions interface { + // ShardSizeMultiple will return the size the shard sizes must be a multiple of. + ShardSizeMultiple() int + + // DataShards will return the number of data shards. + DataShards() int + + // ParityShards will return the number of parity shards. + ParityShards() int + + // TotalShards will return the total number of shards. + TotalShards() int + + // AllocAligned will allocate TotalShards number of slices, + // aligned to reasonable memory sizes. + // Provide the size of each shard. + AllocAligned(each int) [][]byte +} + +const ( + avx2CodeGenMinSize = 64 + avx2CodeGenMinShards = 3 + avx2CodeGenMaxGoroutines = 8 + gfniCodeGenMaxGoroutines = 4 + + intSize = 32 << (^uint(0) >> 63) // 32 or 64 + maxInt = 1<<(intSize-1) - 1 +) + +// reedSolomon contains a matrix for a specific +// distribution of datashards and parity shards. +// Construct if using New() +type reedSolomon struct { + dataShards int // Number of data shards, should not be modified. + parityShards int // Number of parity shards, should not be modified. + totalShards int // Total number of shards. Calculated, and should not be modified. + m matrix + tree *inversionTree + parity [][]byte + o options + mPoolSz int + mPool sync.Pool // Pool for temp matrices, etc +} + +var _ = Extensions(&reedSolomon{}) + +func (r *reedSolomon) ShardSizeMultiple() int { + return 1 +} + +func (r *reedSolomon) DataShards() int { + return r.dataShards +} + +func (r *reedSolomon) ParityShards() int { + return r.parityShards +} + +func (r *reedSolomon) TotalShards() int { + return r.totalShards +} + +func (r *reedSolomon) AllocAligned(each int) [][]byte { + return AllocAligned(r.totalShards, each) +} + +// ErrInvShardNum will be returned by New, if you attempt to create +// an Encoder with less than one data shard or less than zero parity +// shards. 
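Besides Encoder, every instance returned by New also implements the Extensions interface above, which is the supported way to query geometry and allocate shards. A short hedged sketch; the sizes are made up:

```go
package main

import "github.com/klauspost/reedsolomon"

func main() {
	enc, err := reedsolomon.New(10, 4)
	if err != nil {
		panic(err)
	}
	ext := enc.(reedsolomon.Extensions)
	want := 1000 // hypothetical per-shard payload size
	mult := ext.ShardSizeMultiple()
	each := (want + mult - 1) / mult * mult // round up to the required multiple
	shards := ext.AllocAligned(each)        // len(shards) == ext.TotalShards()
	_ = shards
}
```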
+var ErrInvShardNum = errors.New("cannot create Encoder with less than one data shard or less than zero parity shards") + +// ErrMaxShardNum will be returned by New, if you attempt to create an +// Encoder where data and parity shards are bigger than the order of +// GF(2^8). +var ErrMaxShardNum = errors.New("cannot create Encoder with more than 256 data+parity shards") + +// ErrNotSupported is returned when an operation is not supported. +var ErrNotSupported = errors.New("operation not supported") + +// buildMatrix creates the matrix to use for encoding, given the +// number of data shards and the number of total shards. +// +// The top square of the matrix is guaranteed to be an identity +// matrix, which means that the data shards are unchanged after +// encoding. +func buildMatrix(dataShards, totalShards int) (matrix, error) { + // Start with a Vandermonde matrix. This matrix would work, + // in theory, but doesn't have the property that the data + // shards are unchanged after encoding. + vm, err := vandermonde(totalShards, dataShards) + if err != nil { + return nil, err + } + + // Multiply by the inverse of the top square of the matrix. + // This will make the top square be the identity matrix, but + // preserve the property that any square subset of rows is + // invertible. + top, err := vm.SubMatrix(0, 0, dataShards, dataShards) + if err != nil { + return nil, err + } + + topInv, err := top.Invert() + if err != nil { + return nil, err + } + + return vm.Multiply(topInv) +} + +// buildMatrixJerasure creates the same encoding matrix as Jerasure library +// +// The top square of the matrix is guaranteed to be an identity +// matrix, which means that the data shards are unchanged after +// encoding. +func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) { + // Start with a Vandermonde matrix. This matrix would work, + // in theory, but doesn't have the property that the data + // shards are unchanged after encoding. 
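buildMatrix above guarantees that the top dataShards x dataShards block is the identity, which is exactly why data shards pass through encoding unchanged. A same-package sketch of checking that property; `exampleBuildMatrixTop` and the 4+2 geometry are made up:

```go
// Illustrative, same-package sketch using the unexported buildMatrix, SubMatrix
// and identityMatrix helpers defined in this diff.
func exampleBuildMatrixTop() {
	m, err := buildMatrix(4, 6) // 4 data shards, 6 total
	if err != nil {
		panic(err)
	}
	top, _ := m.SubMatrix(0, 0, 4, 4)
	id, _ := identityMatrix(4)
	if top.String() != id.String() {
		panic("top square of the encoding matrix is not the identity")
	}
}
```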
+ vm, err := vandermonde(totalShards, dataShards) + if err != nil { + return nil, err + } + + // Jerasure does this: + // first row is always 100..00 + vm[0][0] = 1 + for i := 1; i < dataShards; i++ { + vm[0][i] = 0 + } + // last row is always 000..01 + for i := 0; i < dataShards-1; i++ { + vm[totalShards-1][i] = 0 + } + vm[totalShards-1][dataShards-1] = 1 + + for i := 0; i < dataShards; i++ { + // Find the row where i'th col is not 0 + r := i + for ; r < totalShards && vm[r][i] == 0; r++ { + } + if r != i { + // Swap it with i'th row if not already + t := vm[r] + vm[r] = vm[i] + vm[i] = t + } + // Multiply by the inverted matrix (same as vm.Multiply(vm[0:dataShards].Invert())) + if vm[i][i] != 1 { + // Make vm[i][i] = 1 by dividing the column by vm[i][i] + tmp := galOneOver(vm[i][i]) + for j := 0; j < totalShards; j++ { + vm[j][i] = galMultiply(vm[j][i], tmp) + } + } + for j := 0; j < dataShards; j++ { + // Make vm[i][j] = 0 where j != i by adding vm[i][j]*vm[.][i] to each column + tmp := vm[i][j] + if j != i && tmp != 0 { + for r := 0; r < totalShards; r++ { + vm[r][j] = galAdd(vm[r][j], galMultiply(tmp, vm[r][i])) + } + } + } + } + + // Make vm[dataShards] row all ones - divide each column j by vm[dataShards][j] + for j := 0; j < dataShards; j++ { + tmp := vm[dataShards][j] + if tmp != 1 { + tmp = galOneOver(tmp) + for i := dataShards; i < totalShards; i++ { + vm[i][j] = galMultiply(vm[i][j], tmp) + } + } + } + + // Make vm[dataShards...totalShards-1][0] column all ones - divide each row + for i := dataShards + 1; i < totalShards; i++ { + tmp := vm[i][0] + if tmp != 1 { + tmp = galOneOver(tmp) + for j := 0; j < dataShards; j++ { + vm[i][j] = galMultiply(vm[i][j], tmp) + } + } + } + + return vm, nil +} + +// buildMatrixPAR1 creates the matrix to use for encoding according to +// the PARv1 spec, given the number of data shards and the number of +// total shards. Note that the method they use is buggy, and may lead +// to cases where recovery is impossible, even if there are enough +// parity shards. +// +// The top square of the matrix is guaranteed to be an identity +// matrix, which means that the data shards are unchanged after +// encoding. +func buildMatrixPAR1(dataShards, totalShards int) (matrix, error) { + result, err := newMatrix(totalShards, dataShards) + if err != nil { + return nil, err + } + + for r, row := range result { + // The top portion of the matrix is the identity + // matrix, and the bottom is a transposed Vandermonde + // matrix starting at 1 instead of 0. + if r < dataShards { + result[r][r] = 1 + } else { + for c := range row { + result[r][c] = galExp(byte(c+1), r-dataShards) + } + } + } + return result, nil +} + +func buildMatrixCauchy(dataShards, totalShards int) (matrix, error) { + result, err := newMatrix(totalShards, dataShards) + if err != nil { + return nil, err + } + + for r, row := range result { + // The top portion of the matrix is the identity + // matrix, and the bottom is a transposed Cauchy matrix. + if r < dataShards { + result[r][r] = 1 + } else { + for c := range row { + result[r][c] = invTable[(byte(r ^ c))] + } + } + } + return result, nil +} + +// buildXorMatrix can be used to build a matrix with pure XOR +// operations if there is only one parity shard. 
+func buildXorMatrix(dataShards, totalShards int) (matrix, error) { + if dataShards+1 != totalShards { + return nil, errors.New("internal error") + } + result, err := newMatrix(totalShards, dataShards) + if err != nil { + return nil, err + } + + for r, row := range result { + // The top portion of the matrix is the identity + // matrix. + if r < dataShards { + result[r][r] = 1 + } else { + // Set all values to 1 (XOR) + for c := range row { + result[r][c] = 1 + } + } + } + return result, nil +} + +// New creates a new encoder and initializes it to +// the number of data shards and parity shards that +// you want to use. You can reuse this encoder. +// Note that the maximum number of total shards is 65536, with some +// restrictions for a total larger than 256: +// +// - Shard sizes must be multiple of 64 +// - The methods Join/Split/Update/EncodeIdx are not supported +// +// If no options are supplied, default options are used. +func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { + o := defaultOptions + for _, opt := range opts { + opt(&o) + } + + totShards := dataShards + parityShards + switch { + case o.withLeopard == leopardGF16 && parityShards > 0 || totShards > 256: + return newFF16(dataShards, parityShards, o) + case o.withLeopard == leopardAlways && parityShards > 0: + return newFF8(dataShards, parityShards, o) + } + if totShards > 256 { + return nil, ErrMaxShardNum + } + + r := reedSolomon{ + dataShards: dataShards, + parityShards: parityShards, + totalShards: dataShards + parityShards, + o: o, + } + + if dataShards <= 0 || parityShards < 0 { + return nil, ErrInvShardNum + } + + if parityShards == 0 { + return &r, nil + } + + var err error + switch { + case r.o.customMatrix != nil: + if len(r.o.customMatrix) < parityShards { + return nil, errors.New("coding matrix must contain at least parityShards rows") + } + r.m = make([][]byte, r.totalShards) + for i := 0; i < dataShards; i++ { + r.m[i] = make([]byte, dataShards) + r.m[i][i] = 1 + } + for k, row := range r.o.customMatrix { + if len(row) < dataShards { + return nil, errors.New("coding matrix must contain at least dataShards columns") + } + r.m[dataShards+k] = make([]byte, dataShards) + copy(r.m[dataShards+k], row) + } + case r.o.fastOneParity && parityShards == 1: + r.m, err = buildXorMatrix(dataShards, r.totalShards) + case r.o.useCauchy: + r.m, err = buildMatrixCauchy(dataShards, r.totalShards) + case r.o.usePAR1Matrix: + r.m, err = buildMatrixPAR1(dataShards, r.totalShards) + case r.o.useJerasureMatrix: + r.m, err = buildMatrixJerasure(dataShards, r.totalShards) + default: + r.m, err = buildMatrix(dataShards, r.totalShards) + } + if err != nil { + return nil, err + } + + // Calculate what we want per round + r.o.perRound = cpuid.CPU.Cache.L2 + if r.o.perRound < 128<<10 { + r.o.perRound = 128 << 10 + } + + divide := parityShards + 1 + if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { + // Base on L1 cache if we have many inputs. + r.o.perRound = cpuid.CPU.Cache.L1D + if r.o.perRound < 32<<10 { + r.o.perRound = 32 << 10 + } + divide = 0 + if dataShards > maxAvx2Inputs { + divide += maxAvx2Inputs + } else { + divide += dataShards + } + if parityShards > maxAvx2Inputs { + divide += maxAvx2Outputs + } else { + divide += parityShards + } + } + + if cpuid.CPU.ThreadsPerCore > 1 && r.o.maxGoroutines > cpuid.CPU.PhysicalCores { + // If multiple threads per core, make sure they don't contend for cache. 
+ r.o.perRound /= cpuid.CPU.ThreadsPerCore + } + + // 1 input + parity must fit in cache, and we add one more to be safer. + r.o.perRound = r.o.perRound / divide + // Align to 64 bytes. + r.o.perRound = ((r.o.perRound + 63) / 64) * 64 + + // Final sanity check... + if r.o.perRound < 1<<10 { + r.o.perRound = 1 << 10 + } + + if r.o.minSplitSize <= 0 { + // Set minsplit as high as we can, but still have parity in L1. + cacheSize := cpuid.CPU.Cache.L1D + if cacheSize <= 0 { + cacheSize = 32 << 10 + } + + r.o.minSplitSize = cacheSize / (parityShards + 1) + // Min 1K + if r.o.minSplitSize < 1024 { + r.o.minSplitSize = 1024 + } + } + + if r.o.shardSize > 0 { + p := runtime.GOMAXPROCS(0) + if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 { + // Not worth it. + r.o.maxGoroutines = 1 + } else { + g := r.o.shardSize / r.o.perRound + + // Overprovision by a factor of 2. + if g < p*2 && r.o.perRound > r.o.minSplitSize*2 { + g = p * 2 + r.o.perRound /= 2 + } + + // Have g be multiple of p + g += p - 1 + g -= g % p + + r.o.maxGoroutines = g + } + } + + // Generated AVX2 does not need data to stay in L1 cache between runs. + // We will be purely limited by RAM speed. + if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { + r.o.maxGoroutines = avx2CodeGenMaxGoroutines + } + + if r.canGFNI(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > gfniCodeGenMaxGoroutines { + r.o.maxGoroutines = gfniCodeGenMaxGoroutines + } + + // Inverted matrices are cached in a tree keyed by the indices + // of the invalid rows of the data to reconstruct. + // The inversion root node will have the identity matrix as + // its inversion matrix because it implies there are no errors + // with the original data. + if r.o.inversionCache { + r.tree = newInversionTree(dataShards, parityShards) + } + + r.parity = make([][]byte, parityShards) + for i := range r.parity { + r.parity[i] = r.m[dataShards+i] + } + + if avx2CodeGen && r.o.useAVX2 { + sz := r.dataShards * r.parityShards * 2 * 32 + r.mPool.New = func() interface{} { + return AllocAligned(1, sz)[0] + } + r.mPoolSz = sz + } + return &r, err +} + +func (r *reedSolomon) getTmpSlice() []byte { + return r.mPool.Get().([]byte) +} + +func (r *reedSolomon) putTmpSlice(b []byte) { + if b != nil && cap(b) >= r.mPoolSz { + r.mPool.Put(b[:r.mPoolSz]) + return + } + if false { + // Sanity check + panic(fmt.Sprintf("got short tmp returned, want %d, got %d", r.mPoolSz, cap(b))) + } +} + +// ErrTooFewShards is returned if too few shards where given to +// Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct +// if there were too few shards to reconstruct the missing data. +var ErrTooFewShards = errors.New("too few shards given") + +// Encode parity for a set of data shards. +// An array 'shards' containing data shards followed by parity shards. +// The number of shards must match the number given to New. +// Each shard is a byte array, and they must all be the same size. +// The parity shards will always be overwritten and the data shards +// will remain the same. +func (r *reedSolomon) Encode(shards [][]byte) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + err := checkShards(shards, false) + if err != nil { + return err + } + + // Get the slice of output buffers. + output := shards[r.dataShards:] + + // Do the coding. 
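+	// Each parity shard is a GF(2^8) linear combination of the data shards:
+	// parity[i] = sum over j of m[dataShards+i][j] * data[j].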
+ r.codeSomeShards(r.parity, shards[0:r.dataShards], output[:r.parityShards], len(shards[0])) + return nil +} + +// EncodeIdx will add parity for a single data shard. +// Parity shards should start out zeroed. The caller must zero them before first call. +// Data shards should only be delivered once. There is no check for this. +// The parity shards will always be updated and the data shards will remain the unchanged. +func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error { + if len(parity) != r.parityShards { + return ErrTooFewShards + } + if len(parity) == 0 { + return nil + } + if idx < 0 || idx >= r.dataShards { + return ErrInvShardNum + } + err := checkShards(parity, false) + if err != nil { + return err + } + if len(parity[0]) != len(dataShard) { + return ErrShardSize + } + + if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) { + m := make([][]byte, r.parityShards) + for iRow := range m { + m[iRow] = r.parity[iRow][idx : idx+1] + } + if r.o.useAvx512GFNI || r.o.useAvxGNFI { + r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) + } else { + r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false) + } + return nil + } + + // Process using no goroutines for now. + start, end := 0, r.o.perRound + if end > len(dataShard) { + end = len(dataShard) + } + + for start < len(dataShard) { + in := dataShard[start:end] + for iRow := 0; iRow < r.parityShards; iRow++ { + galMulSliceXor(r.parity[iRow][idx], in, parity[iRow][start:end], &r.o) + } + start = end + end += r.o.perRound + if end > len(dataShard) { + end = len(dataShard) + } + } + return nil +} + +// ErrInvalidInput is returned if invalid input parameter of Update. +var ErrInvalidInput = errors.New("invalid input") + +func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { + if len(shards) != r.totalShards { + return ErrTooFewShards + } + + if len(newDatashards) != r.dataShards { + return ErrTooFewShards + } + + err := checkShards(shards, true) + if err != nil { + return err + } + + err = checkShards(newDatashards, true) + if err != nil { + return err + } + + for i := range newDatashards { + if newDatashards[i] != nil && shards[i] == nil { + return ErrInvalidInput + } + } + for _, p := range shards[r.dataShards:] { + if p == nil { + return ErrInvalidInput + } + } + + shardSize := shardSize(shards) + + // Get the slice of output buffers. + output := shards[r.dataShards:] + + // Do the coding. 
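+	// Only columns for the supplied (non-nil) new data shards are recomputed:
+	// parity[i] ^= m[dataShards+i][j] * (new[j] XOR old[j]) for each changed j,
+	// so the cost depends on how many shards changed rather than on all of them.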
+ r.updateParityShards(r.parity, shards[0:r.dataShards], newDatashards[0:r.dataShards], output, r.parityShards, shardSize) + return nil +} + +func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { + if len(outputs) == 0 { + return + } + + if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize { + r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount) + return + } + + for c := 0; c < r.dataShards; c++ { + in := newinputs[c] + if in == nil { + continue + } + oldin := oldinputs[c] + // oldinputs data will be changed + sliceXor(in, oldin, &r.o) + for iRow := 0; iRow < outputCount; iRow++ { + galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], &r.o) + } + } +} + +func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) { + var wg sync.WaitGroup + do := byteCount / r.o.maxGoroutines + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + wg.Add(1) + go func(start, stop int) { + for c := 0; c < r.dataShards; c++ { + in := newinputs[c] + if in == nil { + continue + } + oldin := oldinputs[c] + // oldinputs data will be change + sliceXor(in[start:stop], oldin[start:stop], &r.o) + for iRow := 0; iRow < outputCount; iRow++ { + galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], &r.o) + } + } + wg.Done() + }(start, start+do) + start += do + } + wg.Wait() +} + +// Verify returns true if the parity shards contain the right data. +// The data is the same format as Encode. No data is modified. +func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { + if len(shards) != r.totalShards { + return false, ErrTooFewShards + } + err := checkShards(shards, false) + if err != nil { + return false, err + } + + // Slice of buffers being checked. + toCheck := shards[r.dataShards:] + + // Do the checking. + return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil +} + +func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { + return avx2CodeGen && pshufb && r.o.useAVX2 && + byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && + inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs +} + +func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool { + return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) && + byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && + inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs +} + +// Multiplies a subset of rows from a coding matrix by a full set of +// input totalShards to produce some output totalShards. +// 'matrixRows' is The rows from the matrix to use. +// 'inputs' An array of byte arrays, each of which is one input shard. +// The number of inputs used is determined by the length of each matrix row. +// outputs Byte arrays where the computed totalShards are stored. +// The number of outputs computed, and the +// number of matrix rows used, is determined by +// outputCount, which is the number of outputs to compute. 
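+//
+// Conceptually, for each output row i and each input column c this computes
+//
+//	outputs[i] ^= galMultiply(matrixRows[i][c], inputs[c])
+//
+// byte for byte over the shard data, using the SIMD code paths when available.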
+func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) { + if len(outputs) == 0 { + return + } + if byteCount > r.o.minSplitSize { + r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount) + return + } + + // Process using no goroutines + start, end := 0, r.o.perRound + if end > len(inputs[0]) { + end = len(inputs[0]) + } + if r.canGFNI(byteCount, len(inputs), len(outputs)) { + var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:]) + if r.o.useAvx512GFNI { + start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) + } else { + start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount) + } + end = len(inputs[0]) + } else if r.canAVX2C(byteCount, len(inputs), len(outputs)) { + m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) + r.putTmpSlice(m) + end = len(inputs[0]) + } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) { + var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 + end = len(inputs[0]) + inIdx := 0 + m := r.getTmpSlice() + defer r.putTmpSlice(m) + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + if r.o.useAvx512GFNI { + m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) + if inIdx == 0 { + start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) + } else { + start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) + } + } else if r.o.useAvxGNFI { + m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) + if inIdx == 0 { + start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount) + } else { + start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount) + } + } else { + m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + if inIdx == 0 { + start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + } else { + start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + } + } + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + if start >= end { + return + } + } + for start < len(inputs[0]) { + for c := 0; c < len(inputs); c++ { + in := inputs[c][start:end] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) + } + } + } + start = end + end += r.o.perRound + if end > len(inputs[0]) { + end = len(inputs[0]) + } + } +} + +// Perform the same as codeSomeShards, but split the workload into +// several goroutines. 
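+// Each goroutine is given a contiguous, 64-byte aligned range of the shard
+// data, so the output slices can be written concurrently without overlap.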
+func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) { + var wg sync.WaitGroup + gor := r.o.maxGoroutines + + var avx2Matrix []byte + var gfniMatrix []uint64 + useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) + useGFNI := r.canGFNI(byteCount, len(inputs), len(outputs)) + if useGFNI { + var tmp [maxAvx2Inputs * maxAvx2Outputs]uint64 + gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) + } else if useAvx2 { + avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + defer r.putTmpSlice(avx2Matrix) + } else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && + r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + // It appears there is a switchover point at around 10MB where + // Regular processing is faster... + r.codeSomeShardsGFNI(matrixRows, inputs, outputs, byteCount, true) + return + } else if r.o.useAVX2 && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && + r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + // It appears there is a switchover point at around 10MB where + // Regular processing is faster... + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount, true) + return + } + + do := byteCount / gor + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + + exec := func(start, stop int) { + if stop-start >= 64 { + if useGFNI { + if r.o.useAvx512GFNI { + start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) + } else { + start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop) + } + } else if useAvx2 { + start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + } + } + + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + for c := 0; c < len(inputs); c++ { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + wg.Done() + } + if gor <= 1 { + wg.Add(1) + exec(0, byteCount) + return + } + + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + + wg.Add(1) + go exec(start, start+do) + start += do + } + wg.Wait() +} + +// Perform the same as codeSomeShards, but split the workload into +// several goroutines. +// If clear is set, the first write will overwrite the output. +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { + var wg sync.WaitGroup + gor := r.o.maxGoroutines + + type state struct { + input [][]byte + output [][]byte + m []byte + first bool + } + // Make a plan... + plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + + tmp := r.getTmpSlice() + defer r.putTmpSlice(tmp) + + // Flips between input first to output first. + // We put the smallest data load in the inner loop. 
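+	// Each plan entry covers at most maxAvx2Inputs x maxAvx2Outputs shards and
+	// carries its own expanded matrix, so the generated kernels can be run
+	// back-to-back over every byte range.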
+ if len(inputs) > len(outputs) { + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0 && clear, + }) + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + } else { + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0 && clear, + }) + inIdx += len(inPer) + ins = ins[len(inPer):] + } + outIdx += len(outPer) + outs = outs[len(outPer):] + } + } + + do := byteCount / gor + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + + exec := func(start, stop int) { + defer wg.Done() + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + if lstop-lstart >= minAvx2Size { + // Execute plan... + var n int + for _, p := range plan { + if p.first { + n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + } else { + n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) + } + } + lstart += n + if lstart == lstop { + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + continue + } + } + + for c := range inputs { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 && clear { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + } + if gor == 1 { + wg.Add(1) + exec(0, byteCount) + return + } + + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + + wg.Add(1) + go exec(start, start+do) + start += do + } + wg.Wait() +} + +// Perform the same as codeSomeShards, but split the workload into +// several goroutines. +// If clear is set, the first write will overwrite the output. +func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, byteCount int, clear bool) { + var wg sync.WaitGroup + gor := r.o.maxGoroutines + + type state struct { + input [][]byte + output [][]byte + m []uint64 + first bool + } + // Make a plan... + plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + + // Flips between input first to output first. + // We put the smallest data load in the inner loop. 
+ if len(inputs) > len(outputs) { + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + // Generate local matrix + m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0 && clear, + }) + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + } else { + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + // Generate local matrix + m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), make([]uint64, len(inPer)*len(outPer))) + //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0 && clear, + }) + inIdx += len(inPer) + ins = ins[len(inPer):] + } + outIdx += len(outPer) + outs = outs[len(outPer):] + } + } + + do := byteCount / gor + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + + exec := func(start, stop int) { + defer wg.Done() + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + if lstop-lstart >= minAvx2Size { + // Execute plan... + var n int + if r.o.useAvx512GFNI { + for _, p := range plan { + if p.first { + n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop) + } else { + n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop) + } + } + } else { + for _, p := range plan { + if p.first { + n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop) + } else { + n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop) + } + } + } + lstart += n + if lstart == lstop { + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + continue + } + } + + for c := range inputs { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 && clear { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + } + + if gor == 1 { + wg.Add(1) + exec(0, byteCount) + return + } + + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + + wg.Add(1) + go exec(start, start+do) + start += do + } + wg.Wait() +} + +// checkSomeShards is mostly the same as codeSomeShards, +// except this will check values and return +// as soon as a difference is found. 
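+//
+// It re-encodes the parity into scratch buffers and compares the result
+// against the supplied shards.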
+func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool { + if len(toCheck) == 0 { + return true + } + + outputs := AllocAligned(len(toCheck), byteCount) + r.codeSomeShards(matrixRows, inputs, outputs, byteCount) + + for i, calc := range outputs { + if !bytes.Equal(calc, toCheck[i]) { + return false + } + } + return true +} + +// ErrShardNoData will be returned if there are no shards, +// or if the length of all shards is zero. +var ErrShardNoData = errors.New("no shard data") + +// ErrShardSize is returned if shard length isn't the same for all +// shards. +var ErrShardSize = errors.New("shard sizes do not match") + +// ErrInvalidShardSize is returned if shard length doesn't meet the requirements, +// typically a multiple of N. +var ErrInvalidShardSize = errors.New("invalid shard size") + +// checkShards will check if shards are the same size +// or 0, if allowed. An error is returned if this fails. +// An error is also returned if all shards are size 0. +func checkShards(shards [][]byte, nilok bool) error { + size := shardSize(shards) + if size == 0 { + return ErrShardNoData + } + for _, shard := range shards { + if len(shard) != size { + if len(shard) != 0 || !nilok { + return ErrShardSize + } + } + } + return nil +} + +// shardSize return the size of a single shard. +// The first non-zero size is returned, +// or 0 if all shards are size 0. +func shardSize(shards [][]byte) int { + for _, shard := range shards { + if len(shard) != 0 { + return len(shard) + } + } + return 0 +} + +// Reconstruct will recreate the missing shards, if possible. +// +// Given a list of shards, some of which contain data, fills in the +// ones that don't have data. +// +// The length of the array must be equal to shards. +// You indicate that a shard is missing by setting it to nil or zero-length. +// If a shard is zero-length but has sufficient capacity, that memory will +// be used, otherwise a new []byte will be allocated. +// +// If there are too few shards to reconstruct the missing +// ones, ErrTooFewShards will be returned. +// +// The reconstructed shard set is complete, but integrity is not verified. +// Use the Verify function to check if data set is ok. +func (r *reedSolomon) Reconstruct(shards [][]byte) error { + return r.reconstruct(shards, false, nil) +} + +// ReconstructData will recreate any missing data shards, if possible. +// +// Given a list of shards, some of which contain data, fills in the +// data shards that don't have data. +// +// The length of the array must be equal to shards. +// You indicate that a shard is missing by setting it to nil or zero-length. +// If a shard is zero-length but has sufficient capacity, that memory will +// be used, otherwise a new []byte will be allocated. +// +// If there are too few shards to reconstruct the missing +// ones, ErrTooFewShards will be returned. +// +// As the reconstructed shard set may contain missing parity shards, +// calling the Verify function is likely to fail. +func (r *reedSolomon) ReconstructData(shards [][]byte) error { + return r.reconstruct(shards, true, nil) +} + +// ReconstructSome will recreate only requested shards, if possible. +// +// Given a list of shards, some of which contain data, fills in the +// shards indicated by true values in the "required" parameter. +// The length of the "required" array must be equal to either Shards or DataShards. +// If the length is equal to DataShards, the reconstruction of parity shards will be ignored. 
+// +// The length of "shards" array must be equal to Shards. +// You indicate that a shard is missing by setting it to nil or zero-length. +// If a shard is zero-length but has sufficient capacity, that memory will +// be used, otherwise a new []byte will be allocated. +// +// If there are too few shards to reconstruct the missing +// ones, ErrTooFewShards will be returned. +// +// As the reconstructed shard set may contain missing parity shards, +// calling the Verify function is likely to fail. +func (r *reedSolomon) ReconstructSome(shards [][]byte, required []bool) error { + if len(required) == r.totalShards { + return r.reconstruct(shards, false, required) + } + return r.reconstruct(shards, true, required) +} + +// reconstruct will recreate the missing data totalShards, and unless +// dataOnly is true, also the missing parity totalShards +// +// The length of "shards" array must be equal to totalShards. +// You indicate that a shard is missing by setting it to nil. +// +// If there are too few totalShards to reconstruct the missing +// ones, ErrTooFewShards will be returned. +func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []bool) error { + if len(shards) != r.totalShards || required != nil && len(required) < r.dataShards { + return ErrTooFewShards + } + // Check arguments. + err := checkShards(shards, true) + if err != nil { + return err + } + + shardSize := shardSize(shards) + + // Quick check: are all of the shards present? If so, there's + // nothing to do. + numberPresent := 0 + dataPresent := 0 + missingRequired := 0 + for i := 0; i < r.totalShards; i++ { + if len(shards[i]) != 0 { + numberPresent++ + if i < r.dataShards { + dataPresent++ + } + } else if required != nil && required[i] { + missingRequired++ + } + } + if numberPresent == r.totalShards || dataOnly && dataPresent == r.dataShards || + required != nil && missingRequired == 0 { + // Cool. All of the shards have data. We don't + // need to do anything. + return nil + } + + // More complete sanity check + if numberPresent < r.dataShards { + return ErrTooFewShards + } + + // Pull out an array holding just the shards that + // correspond to the rows of the submatrix. These shards + // will be the input to the decoding process that re-creates + // the missing data shards. + // + // Also, create an array of indices of the valid rows we do have + // and the invalid rows we don't have up until we have enough valid rows. + subShards := make([][]byte, r.dataShards) + validIndices := make([]int, r.dataShards) + invalidIndices := make([]int, 0) + subMatrixRow := 0 + for matrixRow := 0; matrixRow < r.totalShards && subMatrixRow < r.dataShards; matrixRow++ { + if len(shards[matrixRow]) != 0 { + subShards[subMatrixRow] = shards[matrixRow] + validIndices[subMatrixRow] = matrixRow + subMatrixRow++ + } else { + invalidIndices = append(invalidIndices, matrixRow) + } + } + + // Attempt to get the cached inverted matrix out of the tree + // based on the indices of the invalid rows. + dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices) + + // If the inverted matrix isn't cached in the tree yet we must + // construct it ourselves and insert it into the tree for the + // future. In this way the inversion tree is lazily loaded. + if dataDecodeMatrix == nil { + // Pull out the rows of the matrix that correspond to the + // shards that we have and build a square matrix. This + // matrix could be used to generate the shards that we have + // from the original data. 
+ subMatrix, _ := newMatrix(r.dataShards, r.dataShards) + for subMatrixRow, validIndex := range validIndices { + for c := 0; c < r.dataShards; c++ { + subMatrix[subMatrixRow][c] = r.m[validIndex][c] + } + } + // Invert the matrix, so we can go from the encoded shards + // back to the original data. Then pull out the row that + // generates the shard that we want to decode. Note that + // since this matrix maps back to the original data, it can + // be used to create a data shard, but not a parity shard. + dataDecodeMatrix, err = subMatrix.Invert() + if err != nil { + return err + } + + // Cache the inverted matrix in the tree for future use keyed on the + // indices of the invalid rows. + err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.totalShards) + if err != nil { + return err + } + } + + // Re-create any data shards that were missing. + // + // The input to the coding is all of the shards we actually + // have, and the output is the missing data shards. The computation + // is done using the special decode matrix we just built. + outputs := make([][]byte, r.parityShards) + matrixRows := make([][]byte, r.parityShards) + outputCount := 0 + + for iShard := 0; iShard < r.dataShards; iShard++ { + if len(shards[iShard]) == 0 && (required == nil || required[iShard]) { + if cap(shards[iShard]) >= shardSize { + shards[iShard] = shards[iShard][0:shardSize] + } else { + shards[iShard] = AllocAligned(1, shardSize)[0] + } + outputs[outputCount] = shards[iShard] + matrixRows[outputCount] = dataDecodeMatrix[iShard] + outputCount++ + } + } + r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize) + + if dataOnly { + // Exit out early if we are only interested in the data shards + return nil + } + + // Now that we have all of the data shards intact, we can + // compute any of the parity that is missing. + // + // The input to the coding is ALL of the data shards, including + // any that we just calculated. The output is whichever of the + // data shards were missing. + outputCount = 0 + for iShard := r.dataShards; iShard < r.totalShards; iShard++ { + if len(shards[iShard]) == 0 && (required == nil || required[iShard]) { + if cap(shards[iShard]) >= shardSize { + shards[iShard] = shards[iShard][0:shardSize] + } else { + shards[iShard] = AllocAligned(1, shardSize)[0] + } + outputs[outputCount] = shards[iShard] + matrixRows[outputCount] = r.parity[iShard-r.dataShards] + outputCount++ + } + } + r.codeSomeShards(matrixRows, shards[:r.dataShards], outputs[:outputCount], shardSize) + return nil +} + +// ErrShortData will be returned by Split(), if there isn't enough data +// to fill the number of shards. +var ErrShortData = errors.New("not enough data to fill the number of requested shards") + +// Split a data slice into the number of shards given to the encoder, +// and create empty parity shards if necessary. +// +// The data will be split into equally sized shards. +// If the data size isn't divisible by the number of shards, +// the last shard will contain extra zeros. +// +// If there is extra capacity on the provided data slice +// it will be used instead of allocating parity shards. +// It will be zeroed out. +// +// There must be at least 1 byte otherwise ErrShortData will be +// returned. +// +// The data will not be copied, except for the last shard, so you +// should not modify the data of the input slice afterwards. 
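+//
+// A minimal round trip sketch (illustrative; data is any non-empty []byte
+// and error handling is omitted):
+//
+//	enc, _ := New(10, 3)
+//	shards, _ := enc.Split(data)
+//	_ = enc.Encode(shards)
+//	var buf bytes.Buffer
+//	_ = enc.Join(&buf, shards, len(data))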
+func (r *reedSolomon) Split(data []byte) ([][]byte, error) { + if len(data) == 0 { + return nil, ErrShortData + } + if r.totalShards == 1 { + return [][]byte{data}, nil + } + + dataLen := len(data) + // Calculate number of bytes per data shard. + perShard := (len(data) + r.dataShards - 1) / r.dataShards + needTotal := r.totalShards * perShard + + if cap(data) > len(data) { + if cap(data) > needTotal { + data = data[:needTotal] + } else { + data = data[:cap(data)] + } + clear := data[dataLen:] + for i := range clear { + clear[i] = 0 + } + } + + // Only allocate memory if necessary + var padding [][]byte + if len(data) < needTotal { + // calculate maximum number of full shards in `data` slice + fullShards := len(data) / perShard + padding = AllocAligned(r.totalShards-fullShards, perShard) + + if dataLen > perShard*fullShards { + // Copy partial shards + copyFrom := data[perShard*fullShards : dataLen] + for i := range padding { + if len(copyFrom) == 0 { + break + } + copyFrom = copyFrom[copy(padding[i], copyFrom):] + } + } + } + + // Split into equal-length shards. + dst := make([][]byte, r.totalShards) + i := 0 + for ; i < len(dst) && len(data) >= perShard; i++ { + dst[i] = data[:perShard:perShard] + data = data[perShard:] + } + + for j := 0; i+j < len(dst); j++ { + dst[i+j] = padding[0] + padding = padding[1:] + } + + return dst, nil +} + +// ErrReconstructRequired is returned if too few data shards are intact and a +// reconstruction is required before you can successfully join the shards. +var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil") + +// Join the shards and write the data segment to dst. +// +// Only the data shards are considered. +// You must supply the exact output size you want. +// +// If there are to few shards given, ErrTooFewShards will be returned. +// If the total data size is less than outSize, ErrShortData will be returned. +// If one or more required data shards are nil, ErrReconstructRequired will be returned. +func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error { + // Do we have enough shards? + if len(shards) < r.dataShards { + return ErrTooFewShards + } + shards = shards[:r.dataShards] + + // Do we have enough data? + size := 0 + for _, shard := range shards { + if shard == nil { + return ErrReconstructRequired + } + size += len(shard) + + // Do we have enough data already? + if size >= outSize { + break + } + } + if size < outSize { + return ErrShortData + } + + // Copy data to dst + write := outSize + for _, shard := range shards { + if write < len(shard) { + _, err := dst.Write(shard[:write]) + return err + } + n, err := dst.Write(shard) + if err != nil { + return err + } + write -= n + } + return nil +} diff --git a/vendor/github.com/klauspost/reedsolomon/streaming.go b/vendor/github.com/klauspost/reedsolomon/streaming.go new file mode 100644 index 000000000..f7aba3b89 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/streaming.go @@ -0,0 +1,614 @@ +/** + * Reed-Solomon Coding over 8-bit values. + * + * Copyright 2015, Klaus Post + * Copyright 2015, Backblaze, Inc. + */ + +package reedsolomon + +import ( + "errors" + "fmt" + "io" + "sync" +) + +// StreamEncoder is an interface to encode Reed-Salomon parity sets for your data. +// It provides a fully streaming interface, and processes data in blocks of up to 4MB. +// +// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface, +// since the streaming interface has a start up overhead. 
+//
+// For all operations, readers and writers should not assume any order/size of
+// individual reads/writes.
+//
+// For usage examples, see "stream-encoder.go" and "streamdecoder.go" in the examples
+// folder.
+type StreamEncoder interface {
+	// Encode parity shards for a set of data shards.
+	//
+	// Input is 'data' containing readers for the data shards, followed by
+	// 'parity' containing io.Writers for the parity shards.
+	//
+	// The number of shards must match the number given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	//
+	// The parity shards will be written to the writer.
+	// The number of bytes written will match the input size.
+	//
+	// If a data stream returns an error, a StreamReadError type error
+	// will be returned. If a parity writer returns an error, a
+	// StreamWriteError will be returned.
+	Encode(data []io.Reader, parity []io.Writer) error
+
+	// Verify returns true if the parity shards contain correct data.
+	//
+	// The number of shards must match the total number of data+parity shards
+	// given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	// If a shard stream returns an error, a StreamReadError type error
+	// will be returned.
+	Verify(shards []io.Reader) (bool, error)
+
+	// Reconstruct will recreate the missing shards if possible.
+	//
+	// Given a list of valid shards (to read) and invalid shards (to write).
+	//
+	// You indicate that a shard is missing by setting it to nil in the 'valid'
+	// slice and at the same time setting a non-nil writer in "fill".
+	// An index cannot contain both a non-nil 'valid' and 'fill' entry.
+	// If both are provided 'ErrReconstructMismatch' is returned.
+	//
+	// If there are too few shards to reconstruct the missing
+	// ones, ErrTooFewShards will be returned.
+	//
+	// The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if the data set is ok.
+	Reconstruct(valid []io.Reader, fill []io.Writer) error
+
+	// Split an input stream into the number of shards given to the encoder.
+	//
+	// The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+	// the last shard will contain extra zeros.
+	//
+	// You must supply the total size of your input.
+	// 'ErrShortData' will be returned if it is unable to retrieve the
+	// number of bytes indicated.
+	Split(data io.Reader, dst []io.Writer, size int64) (err error)
+
+	// Join the shards and write the data segment to dst.
+	//
+	// Only the data shards are considered.
+	//
+	// You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+	// If the total data size is less than outSize, ErrShortData will be returned.
+	Join(dst io.Writer, shards []io.Reader, outSize int64) error
+}
+
+// StreamReadError is returned when a read error is encountered
+// that relates to a supplied stream.
+// This will allow you to find out which reader has failed.
+type StreamReadError struct {
+	Err    error // The error
+	Stream int   // The stream number on which the error occurred
+}
+
+// Error returns the error as a string
+func (s StreamReadError) Error() string {
+	return fmt.Sprintf("error reading stream %d: %s", s.Stream, s.Err)
+}
+
+// String returns the error as a string
+func (s StreamReadError) String() string {
+	return s.Error()
+}
+
+// StreamWriteError is returned when a write error is encountered
+// that relates to a supplied stream. 
This will allow you to +// find out which reader has failed. +type StreamWriteError struct { + Err error // The error + Stream int // The stream number on which the error occurred +} + +// Error returns the error as a string +func (s StreamWriteError) Error() string { + return fmt.Sprintf("error writing stream %d: %s", s.Stream, s.Err) +} + +// String returns the error as a string +func (s StreamWriteError) String() string { + return s.Error() +} + +// rsStream contains a matrix for a specific +// distribution of datashards and parity shards. +// Construct if using NewStream() +type rsStream struct { + r *reedSolomon + o options + + // Shard reader + readShards func(dst [][]byte, in []io.Reader) error + // Shard writer + writeShards func(out []io.Writer, in [][]byte) error + + blockPool sync.Pool +} + +// NewStream creates a new encoder and initializes it to +// the number of data shards and parity shards that +// you want to use. You can reuse this encoder. +// Note that the maximum number of data shards is 256. +func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) { + if dataShards+parityShards > 256 { + return nil, ErrMaxShardNum + } + + r := rsStream{o: defaultOptions} + for _, opt := range o { + opt(&r.o) + } + // Override block size if shard size is set. + if r.o.streamBS == 0 && r.o.shardSize > 0 { + r.o.streamBS = r.o.shardSize + } + if r.o.streamBS <= 0 { + r.o.streamBS = 4 << 20 + } + if r.o.shardSize == 0 && r.o.maxGoroutines == defaultOptions.maxGoroutines { + o = append(o, WithAutoGoroutines(r.o.streamBS)) + } + + enc, err := New(dataShards, parityShards, o...) + if err != nil { + return nil, err + } + r.r = enc.(*reedSolomon) + + r.blockPool.New = func() interface{} { + return AllocAligned(dataShards+parityShards, r.o.streamBS) + } + r.readShards = readShards + r.writeShards = writeShards + if r.o.concReads { + r.readShards = cReadShards + } + if r.o.concWrites { + r.writeShards = cWriteShards + } + + return &r, err +} + +// NewStreamC creates a new encoder and initializes it to +// the number of data shards and parity shards given. +// +// This functions as 'NewStream', but allows you to enable CONCURRENT reads and writes. +func NewStreamC(dataShards, parityShards int, conReads, conWrites bool, o ...Option) (StreamEncoder, error) { + return NewStream(dataShards, parityShards, append(o, WithConcurrentStreamReads(conReads), WithConcurrentStreamWrites(conWrites))...) +} + +func (r *rsStream) createSlice() [][]byte { + out := r.blockPool.Get().([][]byte) + for i := range out { + out[i] = out[i][:r.o.streamBS] + } + return out +} + +// Encodes parity shards for a set of data shards. +// +// Input is 'shards' containing readers for data shards followed by parity shards +// io.Writer. +// +// The number of shards must match the number given to NewStream(). +// +// Each reader must supply the same number of bytes. +// +// The parity shards will be written to the writer. +// The number of bytes written will match the input size. +// +// If a data stream returns an error, a StreamReadError type error +// will be returned. If a parity writer returns an error, a +// StreamWriteError will be returned. 
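+//
+// A minimal sketch (illustrative; dataReaders and parityWriters are opened
+// by the caller and match the counts given to NewStream):
+//
+//	enc, _ := NewStream(10, 3)
+//	err := enc.Encode(dataReaders, parityWriters)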
+func (r *rsStream) Encode(data []io.Reader, parity []io.Writer) error { + if len(data) != r.r.dataShards { + return ErrTooFewShards + } + + if len(parity) != r.r.parityShards { + return ErrTooFewShards + } + + all := r.createSlice() + defer r.blockPool.Put(all) + in := all[:r.r.dataShards] + out := all[r.r.dataShards:] + read := 0 + + for { + err := r.readShards(in, data) + switch err { + case nil: + case io.EOF: + if read == 0 { + return ErrShardNoData + } + return nil + default: + return err + } + out = trimShards(out, shardSize(in)) + read += shardSize(in) + err = r.r.Encode(all) + if err != nil { + return err + } + err = r.writeShards(parity, out) + if err != nil { + return err + } + } +} + +// Trim the shards so they are all the same size +func trimShards(in [][]byte, size int) [][]byte { + for i := range in { + if len(in[i]) != 0 { + in[i] = in[i][0:size] + } + if len(in[i]) < size { + in[i] = in[i][:0] + } + } + return in +} + +func readShards(dst [][]byte, in []io.Reader) error { + if len(in) != len(dst) { + panic("internal error: in and dst size do not match") + } + size := -1 + for i := range in { + if in[i] == nil { + dst[i] = dst[i][:0] + continue + } + n, err := io.ReadFull(in[i], dst[i]) + // The error is EOF only if no bytes were read. + // If an EOF happens after reading some but not all the bytes, + // ReadFull returns ErrUnexpectedEOF. + switch err { + case io.ErrUnexpectedEOF, io.EOF: + if size < 0 { + size = n + } else if n != size { + // Shard sizes must match. + return ErrShardSize + } + dst[i] = dst[i][0:n] + case nil: + continue + default: + return StreamReadError{Err: err, Stream: i} + } + } + if size == 0 { + return io.EOF + } + return nil +} + +func writeShards(out []io.Writer, in [][]byte) error { + if len(out) != len(in) { + panic("internal error: in and out size do not match") + } + for i := range in { + if out[i] == nil { + continue + } + n, err := out[i].Write(in[i]) + if err != nil { + return StreamWriteError{Err: err, Stream: i} + } + // + if n != len(in[i]) { + return StreamWriteError{Err: io.ErrShortWrite, Stream: i} + } + } + return nil +} + +type readResult struct { + n int + size int + err error +} + +// cReadShards reads shards concurrently +func cReadShards(dst [][]byte, in []io.Reader) error { + if len(in) != len(dst) { + panic("internal error: in and dst size do not match") + } + var wg sync.WaitGroup + wg.Add(len(in)) + res := make(chan readResult, len(in)) + for i := range in { + if in[i] == nil { + dst[i] = dst[i][:0] + wg.Done() + continue + } + go func(i int) { + defer wg.Done() + n, err := io.ReadFull(in[i], dst[i]) + // The error is EOF only if no bytes were read. + // If an EOF happens after reading some but not all the bytes, + // ReadFull returns ErrUnexpectedEOF. + res <- readResult{size: n, err: err, n: i} + + }(i) + } + wg.Wait() + close(res) + size := -1 + for r := range res { + switch r.err { + case io.ErrUnexpectedEOF, io.EOF: + if size < 0 { + size = r.size + } else if r.size != size { + // Shard sizes must match. 
+ return ErrShardSize + } + dst[r.n] = dst[r.n][0:r.size] + case nil: + default: + return StreamReadError{Err: r.err, Stream: r.n} + } + } + if size == 0 { + return io.EOF + } + return nil +} + +// cWriteShards writes shards concurrently +func cWriteShards(out []io.Writer, in [][]byte) error { + if len(out) != len(in) { + panic("internal error: in and out size do not match") + } + var errs = make(chan error, len(out)) + var wg sync.WaitGroup + wg.Add(len(out)) + for i := range in { + go func(i int) { + defer wg.Done() + if out[i] == nil { + errs <- nil + return + } + n, err := out[i].Write(in[i]) + if err != nil { + errs <- StreamWriteError{Err: err, Stream: i} + return + } + if n != len(in[i]) { + errs <- StreamWriteError{Err: io.ErrShortWrite, Stream: i} + } + }(i) + } + wg.Wait() + close(errs) + for err := range errs { + if err != nil { + return err + } + } + + return nil +} + +// Verify returns true if the parity shards contain correct data. +// +// The number of shards must match the number total data+parity shards +// given to NewStream(). +// +// Each reader must supply the same number of bytes. +// If a shard stream returns an error, a StreamReadError type error +// will be returned. +func (r *rsStream) Verify(shards []io.Reader) (bool, error) { + if len(shards) != r.r.totalShards { + return false, ErrTooFewShards + } + + read := 0 + all := r.createSlice() + defer r.blockPool.Put(all) + for { + err := r.readShards(all, shards) + if err == io.EOF { + if read == 0 { + return false, ErrShardNoData + } + return true, nil + } + if err != nil { + return false, err + } + read += shardSize(all) + ok, err := r.r.Verify(all) + if !ok || err != nil { + return ok, err + } + } +} + +// ErrReconstructMismatch is returned by the StreamEncoder, if you supply +// "valid" and "fill" streams on the same index. +// Therefore it is impossible to see if you consider the shard valid +// or would like to have it reconstructed. +var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive") + +// Reconstruct will recreate the missing shards if possible. +// +// Given a list of valid shards (to read) and invalid shards (to write) +// +// You indicate that a shard is missing by setting it to nil in the 'valid' +// slice and at the same time setting a non-nil writer in "fill". +// An index cannot contain both non-nil 'valid' and 'fill' entry. +// +// If there are too few shards to reconstruct the missing +// ones, ErrTooFewShards will be returned. +// +// The reconstructed shard set is complete when explicitly asked for all missing shards. +// However its integrity is not automatically verified. +// Use the Verify function to check in case the data set is complete. 
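+//
+// Illustrative sketch for a single missing shard at index 2 (the remaining
+// 'valid' entries are open readers, the remaining 'fill' entries are nil,
+// and rebuiltShard is a hypothetical io.Writer for the recreated shard):
+//
+//	valid[2] = nil
+//	fill[2] = rebuiltShard
+//	err := enc.Reconstruct(valid, fill)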
+func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error { + if len(valid) != r.r.totalShards { + return ErrTooFewShards + } + if len(fill) != r.r.totalShards { + return ErrTooFewShards + } + + all := r.createSlice() + defer r.blockPool.Put(all) + reconDataOnly := true + for i := range valid { + if valid[i] != nil && fill[i] != nil { + return ErrReconstructMismatch + } + if i >= r.r.dataShards && fill[i] != nil { + reconDataOnly = false + } + } + + read := 0 + for { + err := r.readShards(all, valid) + if err == io.EOF { + if read == 0 { + return ErrShardNoData + } + return nil + } + if err != nil { + return err + } + read += shardSize(all) + all = trimShards(all, shardSize(all)) + + if reconDataOnly { + err = r.r.ReconstructData(all) // just reconstruct missing data shards + } else { + err = r.r.Reconstruct(all) // reconstruct all missing shards + } + if err != nil { + return err + } + err = r.writeShards(fill, all) + if err != nil { + return err + } + } +} + +// Join the shards and write the data segment to dst. +// +// Only the data shards are considered. +// +// You must supply the exact output size you want. +// If there are to few shards given, ErrTooFewShards will be returned. +// If the total data size is less than outSize, ErrShortData will be returned. +func (r *rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error { + // Do we have enough shards? + if len(shards) < r.r.dataShards { + return ErrTooFewShards + } + + // Trim off parity shards if any + shards = shards[:r.r.dataShards] + for i := range shards { + if shards[i] == nil { + return StreamReadError{Err: ErrShardNoData, Stream: i} + } + } + // Join all shards + src := io.MultiReader(shards...) + + // Copy data to dst + n, err := io.CopyN(dst, src, outSize) + if err == io.EOF { + return ErrShortData + } + if err != nil { + return err + } + if n != outSize { + return ErrShortData + } + return nil +} + +// Split a an input stream into the number of shards given to the encoder. +// +// The data will be split into equally sized shards. +// If the data size isn't dividable by the number of shards, +// the last shard will contain extra zeros. +// +// You must supply the total size of your input. +// 'ErrShortData' will be returned if it is unable to retrieve the +// number of bytes indicated. +func (r *rsStream) Split(data io.Reader, dst []io.Writer, size int64) error { + if size == 0 { + return ErrShortData + } + if len(dst) != r.r.dataShards { + return ErrInvShardNum + } + + for i := range dst { + if dst[i] == nil { + return StreamWriteError{Err: ErrShardNoData, Stream: i} + } + } + + // Calculate number of bytes per shard. + perShard := (size + int64(r.r.dataShards) - 1) / int64(r.r.dataShards) + + // Pad data to r.Shards*perShard. + paddingSize := (int64(r.r.totalShards) * perShard) - size + data = io.MultiReader(data, io.LimitReader(zeroPaddingReader{}, paddingSize)) + + // Split into equal-length shards and copy. 
+ for i := range dst { + n, err := io.CopyN(dst[i], data, perShard) + if err != io.EOF && err != nil { + return err + } + if n != perShard { + return ErrShortData + } + } + + return nil +} + +type zeroPaddingReader struct{} + +var _ io.Reader = &zeroPaddingReader{} + +func (t zeroPaddingReader) Read(p []byte) (n int, err error) { + n = len(p) + for i := 0; i < n; i++ { + p[i] = 0 + } + return n, nil +} diff --git a/vendor/github.com/klauspost/reedsolomon/unsafe.go b/vendor/github.com/klauspost/reedsolomon/unsafe.go new file mode 100644 index 000000000..d85892f0f --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/unsafe.go @@ -0,0 +1,41 @@ +//go:build !noasm && !nounsafe && !gccgo && !appengine + +/** + * Reed-Solomon Coding over 8-bit values. + * + * Copyright 2023, Klaus Post + */ + +package reedsolomon + +import ( + "unsafe" +) + +// AllocAligned allocates 'shards' slices, with 'each' bytes. +// Each slice will start on a 64 byte aligned boundary. +func AllocAligned(shards, each int) [][]byte { + if false { + res := make([][]byte, shards) + for i := range res { + res[i] = make([]byte, each) + } + return res + } + const ( + alignEach = 64 + alignStart = 64 + ) + eachAligned := ((each + alignEach - 1) / alignEach) * alignEach + total := make([]byte, eachAligned*shards+63) + align := uint(uintptr(unsafe.Pointer(&total[0]))) & (alignStart - 1) + if align > 0 { + total = total[alignStart-align:] + } + res := make([][]byte, shards) + for i := range res { + res[i] = total[:each:eachAligned] + total = total[eachAligned:] + } + return res +} diff --git a/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go b/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go new file mode 100644 index 000000000..95cb8e6eb --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/unsafe_disabled.go @@ -0,0 +1,23 @@ +//go:build noasm || nounsafe || gccgo || appengine + +/** + * Reed-Solomon Coding over 8-bit values. + * + * Copyright 2023, Klaus Post + */ + +package reedsolomon + +// AllocAligned allocates 'shards' slices, with 'each' bytes. +// Each slice will start on a 64 byte aligned boundary. +func AllocAligned(shards, each int) [][]byte { + eachAligned := ((each + 63) / 64) * 64 + total := make([]byte, eachAligned*shards+63) + // We cannot do initial align without "unsafe", just use native alignment. 
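+	// Each shard gets its own eachAligned-byte region of the shared backing
+	// slice; the capacity limit keeps appends from spilling into the next shard.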
+ res := make([][]byte, shards) + for i := range res { + res[i] = total[:each:eachAligned] + total = total[eachAligned:] + } + return res +} diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go new file mode 100644 index 000000000..6f0522f88 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go @@ -0,0 +1,19 @@ +//go:build !noasm && !appengine && !gccgo + +package reedsolomon + +//go:noescape +func xorSliceNEON(in, out []byte) + +// simple slice xor +func sliceXor(in, out []byte, o *options) { + xorSliceNEON(in, out) + done := (len(in) >> 5) << 5 + + remain := len(in) - done + if remain > 0 { + for i := done; i < len(in); i++ { + out[i] ^= in[i] + } + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.s b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s new file mode 100644 index 000000000..562987316 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.s @@ -0,0 +1,29 @@ +//+build !noasm +//+build !appengine +//+build !gccgo + +// func xorSliceNEON(in, out []byte) +TEXT ·xorSliceNEON(SB), 7, $0 + MOVD in_base+0(FP), R1 + MOVD in_len+8(FP), R2 // length of message + MOVD out_base+24(FP), R5 + SUBS $32, R2 + BMI completeXor + +loopXor: + // Main loop + VLD1.P 32(R1), [V0.B16, V1.B16] + VLD1 (R5), [V20.B16, V21.B16] + + VEOR V20.B16, V0.B16, V4.B16 + VEOR V21.B16, V1.B16, V5.B16 + + // Store result + VST1.P [V4.D2, V5.D2], 32(R5) + + SUBS $32, R2 + BPL loopXor + +completeXor: + RET + diff --git a/vendor/github.com/klauspost/reedsolomon/xor_noasm.go b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go new file mode 100644 index 000000000..d3e29f90e --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/xor_noasm.go @@ -0,0 +1,7 @@ +//go:build noasm || gccgo || appengine || (!amd64 && !arm64) + +package reedsolomon + +func sliceXor(in, out []byte, o *options) { + sliceXorGo(in, out, o) +} diff --git a/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s b/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s index db9171c2e..269e173ca 100644 --- a/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s +++ b/vendor/golang.org/x/sys/cpu/asm_aix_ppc64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
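Editor's note: both AllocAligned variants above round each shard's capacity up to a 64-byte multiple, but only the unsafe build aligns the start address. A small hedged check of that guarantee, using only the exported AllocAligned shown in this diff:

package main

import (
	"fmt"
	"unsafe"

	"github.com/klauspost/reedsolomon"
)

func main() {
	shards := reedsolomon.AllocAligned(4, 1000) // 4 shards of 1000 bytes each
	for i, s := range shards {
		addr := uintptr(unsafe.Pointer(&s[0]))
		// With the default (unsafe) build every shard starts on a 64-byte boundary;
		// with the noasm/nounsafe/gccgo/appengine build only len/cap rounding holds.
		fmt.Printf("shard %d: len=%d cap=%d start%%64==0: %v\n", i, len(s), cap(s), addr%64 == 0)
	}
}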
//go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go index 83f112c4c..4756ad5f7 100644 --- a/vendor/golang.org/x/sys/cpu/cpu.go +++ b/vendor/golang.org/x/sys/cpu/cpu.go @@ -38,7 +38,7 @@ var X86 struct { HasAVX512F bool // Advanced vector extension 512 Foundation Instructions HasAVX512CD bool // Advanced vector extension 512 Conflict Detection Instructions HasAVX512ER bool // Advanced vector extension 512 Exponential and Reciprocal Instructions - HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions Instructions + HasAVX512PF bool // Advanced vector extension 512 Prefetch Instructions HasAVX512VL bool // Advanced vector extension 512 Vector Length Extensions HasAVX512BW bool // Advanced vector extension 512 Byte and Word Instructions HasAVX512DQ bool // Advanced vector extension 512 Doubleword and Quadword Instructions @@ -54,6 +54,9 @@ var X86 struct { HasAVX512VBMI2 bool // Advanced vector extension 512 Vector Byte Manipulation Instructions 2 HasAVX512BITALG bool // Advanced vector extension 512 Bit Algorithms HasAVX512BF16 bool // Advanced vector extension 512 BFloat16 Instructions + HasAMXTile bool // Advanced Matrix Extension Tile instructions + HasAMXInt8 bool // Advanced Matrix Extension Int8 instructions + HasAMXBF16 bool // Advanced Matrix Extension BFloat16 instructions HasBMI1 bool // Bit manipulation instruction set 1 HasBMI2 bool // Bit manipulation instruction set 2 HasCX16 bool // Compare and exchange 16 Bytes diff --git a/vendor/golang.org/x/sys/cpu/cpu_aix.go b/vendor/golang.org/x/sys/cpu/cpu_aix.go index 8aaeef545..9bf0c32eb 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_aix.go +++ b/vendor/golang.org/x/sys/cpu/cpu_aix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix -// +build aix package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.s b/vendor/golang.org/x/sys/cpu/cpu_arm64.s index c61f95a05..fcb9a3888 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_arm64.s +++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go index ccf542a73..a8acd3e32 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gc_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go index 0af2f2484..c8ae6ddc1 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gc_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go index fa7cdb9bc..910728fb1 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gc_x86.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build (386 || amd64 || amd64p32) && gc -// +build 386 amd64 amd64p32 -// +build gc package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go index 2aff31891..7f1946780 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gccgo -// +build gccgo package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go index 4bfbda619..9526d2ce3 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gccgo -// +build gccgo package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c index 6cc73109f..3f73a05dc 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c +++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (386 || amd64 || amd64p32) && gccgo -// +build 386 amd64 amd64p32 -// +build gccgo #include #include diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go index 863d415ab..99c60fe9f 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go +++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (386 || amd64 || amd64p32) && gccgo -// +build 386 amd64 amd64p32 -// +build gccgo package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux.go b/vendor/golang.org/x/sys/cpu/cpu_linux.go index 159a686f6..743eb5435 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !386 && !amd64 && !amd64p32 && !arm64 -// +build !386,!amd64,!amd64p32,!arm64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go index 6000db4cd..4686c1d54 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_mips64x.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (mips64 || mips64le) -// +build linux -// +build mips64 mips64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go index f4992b1a5..cd63e7335 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x -// +build linux,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le,!s390x package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go index 021356d6d..197188e67 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_ppc64x.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build linux && (ppc64 || ppc64le) -// +build linux -// +build ppc64 ppc64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_loong64.go b/vendor/golang.org/x/sys/cpu/cpu_loong64.go index 0f57b05bd..558635850 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_loong64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_loong64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build loong64 -// +build loong64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go index f4063c664..fedb00cc4 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_mips64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_mips64x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build mips64 || mips64le -// +build mips64 mips64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go index 07c4e36d8..ffb4ec7eb 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_mipsx.go +++ b/vendor/golang.org/x/sys/cpu/cpu_mipsx.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build mips || mipsle -// +build mips mipsle package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_arm.go b/vendor/golang.org/x/sys/cpu/cpu_other_arm.go index d7b4fb4cc..e9ecf2a45 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_arm.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !linux && arm -// +build !linux,arm package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go index f3cde129b..5341e7f88 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !linux && !netbsd && !openbsd && arm64 -// +build !linux,!netbsd,!openbsd,arm64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go b/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go index 0dafe9644..5f8f2419a 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_mips64x.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build !linux && (mips64 || mips64le) -// +build !linux -// +build mips64 mips64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go index 060d46b6e..89608fba2 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_ppc64x.go @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build !aix && !linux && (ppc64 || ppc64le) -// +build !aix -// +build !linux -// +build ppc64 ppc64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go index dd10eb79f..5ab87808f 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_riscv64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !linux && riscv64 -// +build !linux,riscv64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go index 4e8acd165..c14f12b14 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go +++ b/vendor/golang.org/x/sys/cpu/cpu_ppc64x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build ppc64 || ppc64le -// +build ppc64 ppc64le package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go index bd6c128af..7f0c79c00 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go @@ -3,10 +3,9 @@ // license that can be found in the LICENSE file. //go:build riscv64 -// +build riscv64 package cpu -const cacheLineSize = 32 +const cacheLineSize = 64 func initOptions() {} diff --git a/vendor/golang.org/x/sys/cpu/cpu_s390x.s b/vendor/golang.org/x/sys/cpu/cpu_s390x.s index 96f81e209..1fb4b7013 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_s390x.s +++ b/vendor/golang.org/x/sys/cpu/cpu_s390x.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/cpu/cpu_wasm.go b/vendor/golang.org/x/sys/cpu/cpu_wasm.go index 7747d888a..384787ea3 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_wasm.go +++ b/vendor/golang.org/x/sys/cpu/cpu_wasm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build wasm -// +build wasm package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.go b/vendor/golang.org/x/sys/cpu/cpu_x86.go index f5aacfc82..c29f5e4c5 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_x86.go +++ b/vendor/golang.org/x/sys/cpu/cpu_x86.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 || amd64 || amd64p32 -// +build 386 amd64 amd64p32 package cpu @@ -37,6 +36,9 @@ func initOptions() { {Name: "avx512vbmi2", Feature: &X86.HasAVX512VBMI2}, {Name: "avx512bitalg", Feature: &X86.HasAVX512BITALG}, {Name: "avx512bf16", Feature: &X86.HasAVX512BF16}, + {Name: "amxtile", Feature: &X86.HasAMXTile}, + {Name: "amxint8", Feature: &X86.HasAMXInt8}, + {Name: "amxbf16", Feature: &X86.HasAMXBF16}, {Name: "bmi1", Feature: &X86.HasBMI1}, {Name: "bmi2", Feature: &X86.HasBMI2}, {Name: "cx16", Feature: &X86.HasCX16}, @@ -138,6 +140,10 @@ func archInit() { eax71, _, _, _ := cpuid(7, 1) X86.HasAVX512BF16 = isSet(5, eax71) } + + X86.HasAMXTile = isSet(24, edx7) + X86.HasAMXInt8 = isSet(25, edx7) + X86.HasAMXBF16 = isSet(22, edx7) } func isSet(bitpos uint, value uint32) bool { diff --git a/vendor/golang.org/x/sys/cpu/cpu_x86.s b/vendor/golang.org/x/sys/cpu/cpu_x86.s index 39acab2ff..7d7ba33ef 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_x86.s +++ b/vendor/golang.org/x/sys/cpu/cpu_x86.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (386 || amd64 || amd64p32) && gc -// +build 386 amd64 amd64p32 -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/cpu/endian_big.go b/vendor/golang.org/x/sys/cpu/endian_big.go index 93ce03a34..7fe04b0a1 100644 --- a/vendor/golang.org/x/sys/cpu/endian_big.go +++ b/vendor/golang.org/x/sys/cpu/endian_big.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build armbe || arm64be || m68k || mips || mips64 || mips64p32 || ppc || ppc64 || s390 || s390x || shbe || sparc || sparc64 -// +build armbe arm64be m68k mips mips64 mips64p32 ppc ppc64 s390 s390x shbe sparc sparc64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/endian_little.go b/vendor/golang.org/x/sys/cpu/endian_little.go index 55db853ef..48eccc4c7 100644 --- a/vendor/golang.org/x/sys/cpu/endian_little.go +++ b/vendor/golang.org/x/sys/cpu/endian_little.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
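Editor's note: the three new AMX feature bits above are read from CPUID leaf 7 (EDX bits 22, 24 and 25) and surface through the same cpu.X86 struct as the existing flags. A trivial sketch of consuming them from the vendored golang.org/x/sys/cpu package:

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// Detection runs in the package's init step; the booleans are then static.
	fmt.Println("AMX tile:", cpu.X86.HasAMXTile)
	fmt.Println("AMX int8:", cpu.X86.HasAMXInt8)
	fmt.Println("AMX bf16:", cpu.X86.HasAMXBF16)
}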
//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh || wasm -// +build 386 amd64 amd64p32 alpha arm arm64 loong64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh wasm package cpu diff --git a/vendor/golang.org/x/sys/cpu/hwcap_linux.go b/vendor/golang.org/x/sys/cpu/hwcap_linux.go index 1d9d91f3e..34e49f955 100644 --- a/vendor/golang.org/x/sys/cpu/hwcap_linux.go +++ b/vendor/golang.org/x/sys/cpu/hwcap_linux.go @@ -5,7 +5,7 @@ package cpu import ( - "io/ioutil" + "os" ) const ( @@ -39,7 +39,7 @@ func readHWCAP() error { return nil } - buf, err := ioutil.ReadFile(procAuxv) + buf, err := os.ReadFile(procAuxv) if err != nil { // e.g. on android /proc/self/auxv is not accessible, so silently // ignore the error and leave Initialized = false. On some diff --git a/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go b/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go index d87bd6b3e..4cd64c704 100644 --- a/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go +++ b/vendor/golang.org/x/sys/cpu/proc_cpuinfo_linux.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && arm64 -// +build linux,arm64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go b/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go index b975ea2a0..4c9788ea8 100644 --- a/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go +++ b/vendor/golang.org/x/sys/cpu/runtime_auxv_go121.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.21 -// +build go1.21 package cpu diff --git a/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go b/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go index 96134157a..1b9ccb091 100644 --- a/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go +++ b/vendor/golang.org/x/sys/cpu/syscall_aix_gccgo.go @@ -9,7 +9,6 @@ // gccgo's libgo and thus must not used a CGo method. //go:build aix && gccgo -// +build aix,gccgo package cpu diff --git a/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go b/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go index 904be42ff..e8b6cdbe9 100644 --- a/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go +++ b/vendor/golang.org/x/sys/cpu/syscall_aix_ppc64_gc.go @@ -7,7 +7,6 @@ // (See golang.org/issue/32102) //go:build aix && ppc64 && gc -// +build aix,ppc64,gc package cpu diff --git a/vendor/golang.org/x/sys/execabs/execabs_go118.go b/vendor/golang.org/x/sys/execabs/execabs_go118.go index 2000064a8..5627d70e3 100644 --- a/vendor/golang.org/x/sys/execabs/execabs_go118.go +++ b/vendor/golang.org/x/sys/execabs/execabs_go118.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !go1.19 -// +build !go1.19 package execabs diff --git a/vendor/golang.org/x/sys/execabs/execabs_go119.go b/vendor/golang.org/x/sys/execabs/execabs_go119.go index f364b3418..d60ab1b41 100644 --- a/vendor/golang.org/x/sys/execabs/execabs_go119.go +++ b/vendor/golang.org/x/sys/execabs/execabs_go119.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.19 -// +build go1.19 package execabs diff --git a/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go b/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go deleted file mode 100644 index e07899b90..000000000 --- a/vendor/golang.org/x/sys/internal/unsafeheader/unsafeheader.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2020 The Go Authors. All rights reserved. 
-// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package unsafeheader contains header declarations for the Go runtime's -// slice and string implementations. -// -// This package allows x/sys to use types equivalent to -// reflect.SliceHeader and reflect.StringHeader without introducing -// a dependency on the (relatively heavy) "reflect" package. -package unsafeheader - -import ( - "unsafe" -) - -// Slice is the runtime representation of a slice. -// It cannot be used safely or portably and its representation may change in a later release. -type Slice struct { - Data unsafe.Pointer - Len int - Cap int -} - -// String is the runtime representation of a string. -// It cannot be used safely or portably and its representation may change in a later release. -type String struct { - Data unsafe.Pointer - Len int -} diff --git a/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go b/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go index c9b69937a..73687de74 100644 --- a/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go +++ b/vendor/golang.org/x/sys/plan9/pwd_go15_plan9.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build go1.5 -// +build go1.5 package plan9 diff --git a/vendor/golang.org/x/sys/plan9/pwd_plan9.go b/vendor/golang.org/x/sys/plan9/pwd_plan9.go index 98bf56b73..fb9458218 100644 --- a/vendor/golang.org/x/sys/plan9/pwd_plan9.go +++ b/vendor/golang.org/x/sys/plan9/pwd_plan9.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !go1.5 -// +build !go1.5 package plan9 diff --git a/vendor/golang.org/x/sys/plan9/race.go b/vendor/golang.org/x/sys/plan9/race.go index 62377d2ff..c02d9ed33 100644 --- a/vendor/golang.org/x/sys/plan9/race.go +++ b/vendor/golang.org/x/sys/plan9/race.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build plan9 && race -// +build plan9,race package plan9 diff --git a/vendor/golang.org/x/sys/plan9/race0.go b/vendor/golang.org/x/sys/plan9/race0.go index f8da30876..7b15e15f6 100644 --- a/vendor/golang.org/x/sys/plan9/race0.go +++ b/vendor/golang.org/x/sys/plan9/race0.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build plan9 && !race -// +build plan9,!race package plan9 diff --git a/vendor/golang.org/x/sys/plan9/str.go b/vendor/golang.org/x/sys/plan9/str.go index 55fa8d025..ba3e8ff8a 100644 --- a/vendor/golang.org/x/sys/plan9/str.go +++ b/vendor/golang.org/x/sys/plan9/str.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build plan9 -// +build plan9 package plan9 diff --git a/vendor/golang.org/x/sys/plan9/syscall.go b/vendor/golang.org/x/sys/plan9/syscall.go index 67e5b0115..d631fd664 100644 --- a/vendor/golang.org/x/sys/plan9/syscall.go +++ b/vendor/golang.org/x/sys/plan9/syscall.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build plan9 -// +build plan9 // Package plan9 contains an interface to the low-level operating system // primitives. OS details vary depending on the underlying system, and diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go index 3f40b9bd7..f780d5c80 100644 --- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go +++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build plan9 && 386 -// +build plan9,386 package plan9 diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go index 0e6a96aa4..7de61065f 100644 --- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go +++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build plan9 && amd64 -// +build plan9,amd64 package plan9 diff --git a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go index 244c501b7..ea85780f0 100644 --- a/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go +++ b/vendor/golang.org/x/sys/plan9/zsyscall_plan9_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build plan9 && arm -// +build plan9,arm package plan9 diff --git a/vendor/golang.org/x/sys/unix/aliases.go b/vendor/golang.org/x/sys/unix/aliases.go index abc89c104..b0e419857 100644 --- a/vendor/golang.org/x/sys/unix/aliases.go +++ b/vendor/golang.org/x/sys/unix/aliases.go @@ -2,9 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos) && go1.9 -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos -// +build go1.9 +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos package unix diff --git a/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s b/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s index db9171c2e..269e173ca 100644 --- a/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s +++ b/vendor/golang.org/x/sys/unix/asm_aix_ppc64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_386.s b/vendor/golang.org/x/sys/unix/asm_bsd_386.s index e0fcd9b3d..a4fcef0e0 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_386.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_386.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (freebsd || netbsd || openbsd) && gc -// +build freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s b/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s index 2b99c349a..1e63615c5 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_amd64.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin || dragonfly || freebsd || netbsd || openbsd) && gc -// +build darwin dragonfly freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_arm.s b/vendor/golang.org/x/sys/unix/asm_bsd_arm.s index d702d4adc..6496c3100 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_arm.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_arm.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (freebsd || netbsd || openbsd) && gc -// +build freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s b/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s index fe36a7391..4fd1f54da 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_arm64.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build (darwin || freebsd || netbsd || openbsd) && gc -// +build darwin freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s b/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s index e5b9a8489..42f7eb9e4 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_ppc64.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin || freebsd || netbsd || openbsd) && gc -// +build darwin freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s b/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s index d560019ea..f8902667e 100644 --- a/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s +++ b/vendor/golang.org/x/sys/unix/asm_bsd_riscv64.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin || freebsd || netbsd || openbsd) && gc -// +build darwin freebsd netbsd openbsd -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_386.s b/vendor/golang.org/x/sys/unix/asm_linux_386.s index 8fd101d07..3b4734870 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_386.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_386.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_amd64.s b/vendor/golang.org/x/sys/unix/asm_linux_amd64.s index 7ed38e43c..67e29f317 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_amd64.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_arm.s b/vendor/golang.org/x/sys/unix/asm_linux_arm.s index 8ef1d5140..d6ae269ce 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_arm.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_arm.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_arm64.s b/vendor/golang.org/x/sys/unix/asm_linux_arm64.s index 98ae02760..01e5e253c 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_arm64.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_arm64.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && arm64 && gc -// +build linux -// +build arm64 -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_loong64.s b/vendor/golang.org/x/sys/unix/asm_linux_loong64.s index 565357288..2abf12f6e 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_loong64.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_loong64.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && loong64 && gc -// +build linux -// +build loong64 -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s b/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s index 21231d2ce..f84bae712 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_mips64x.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build linux && (mips64 || mips64le) && gc -// +build linux -// +build mips64 mips64le -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s b/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s index 6783b26c6..f08f62807 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_mipsx.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (mips || mipsle) && gc -// +build linux -// +build mips mipsle -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s b/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s index 19d498934..bdfc024d2 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_ppc64x.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (ppc64 || ppc64le) && gc -// +build linux -// +build ppc64 ppc64le -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s b/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s index e42eb81d5..2e8c99612 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_riscv64.s @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build riscv64 && gc -// +build riscv64 -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_linux_s390x.s b/vendor/golang.org/x/sys/unix/asm_linux_s390x.s index c46aab339..2c394b11e 100644 --- a/vendor/golang.org/x/sys/unix/asm_linux_s390x.s +++ b/vendor/golang.org/x/sys/unix/asm_linux_s390x.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && s390x && gc -// +build linux -// +build s390x -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s index 5e7a1169c..fab586a2c 100644 --- a/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s +++ b/vendor/golang.org/x/sys/unix/asm_openbsd_mips64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s b/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s index f8c5394c1..f949ec547 100644 --- a/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s +++ b/vendor/golang.org/x/sys/unix/asm_solaris_amd64.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gc -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/asm_zos_s390x.s b/vendor/golang.org/x/sys/unix/asm_zos_s390x.s index 3b54e1858..2f67ba86d 100644 --- a/vendor/golang.org/x/sys/unix/asm_zos_s390x.s +++ b/vendor/golang.org/x/sys/unix/asm_zos_s390x.s @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x && gc -// +build zos -// +build s390x -// +build gc #include "textflag.h" diff --git a/vendor/golang.org/x/sys/unix/cap_freebsd.go b/vendor/golang.org/x/sys/unix/cap_freebsd.go index 0b7c6adb8..a08657890 100644 --- a/vendor/golang.org/x/sys/unix/cap_freebsd.go +++ b/vendor/golang.org/x/sys/unix/cap_freebsd.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build freebsd -// +build freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/constants.go b/vendor/golang.org/x/sys/unix/constants.go index 394a3965b..6fb7cb77d 100644 --- a/vendor/golang.org/x/sys/unix/constants.go +++ b/vendor/golang.org/x/sys/unix/constants.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos package unix diff --git a/vendor/golang.org/x/sys/unix/dev_aix_ppc.go b/vendor/golang.org/x/sys/unix/dev_aix_ppc.go index 65a998508..d78513461 100644 --- a/vendor/golang.org/x/sys/unix/dev_aix_ppc.go +++ b/vendor/golang.org/x/sys/unix/dev_aix_ppc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix && ppc -// +build aix,ppc // Functions to access/create device major and minor numbers matching the // encoding used by AIX. diff --git a/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go b/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go index 8fc08ad0a..623a5e697 100644 --- a/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go +++ b/vendor/golang.org/x/sys/unix/dev_aix_ppc64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix && ppc64 -// +build aix,ppc64 // Functions to access/create device major and minor numbers matching the // encoding used AIX. diff --git a/vendor/golang.org/x/sys/unix/dev_zos.go b/vendor/golang.org/x/sys/unix/dev_zos.go index a388e59a0..bb6a64fe9 100644 --- a/vendor/golang.org/x/sys/unix/dev_zos.go +++ b/vendor/golang.org/x/sys/unix/dev_zos.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x // Functions to access/create device major and minor numbers matching the // encoding used by z/OS. diff --git a/vendor/golang.org/x/sys/unix/dirent.go b/vendor/golang.org/x/sys/unix/dirent.go index 2499f977b..1ebf11782 100644 --- a/vendor/golang.org/x/sys/unix/dirent.go +++ b/vendor/golang.org/x/sys/unix/dirent.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos package unix diff --git a/vendor/golang.org/x/sys/unix/endian_big.go b/vendor/golang.org/x/sys/unix/endian_big.go index a52026557..1095fd31d 100644 --- a/vendor/golang.org/x/sys/unix/endian_big.go +++ b/vendor/golang.org/x/sys/unix/endian_big.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. // //go:build armbe || arm64be || m68k || mips || mips64 || mips64p32 || ppc || ppc64 || s390 || s390x || shbe || sparc || sparc64 -// +build armbe arm64be m68k mips mips64 mips64p32 ppc ppc64 s390 s390x shbe sparc sparc64 package unix diff --git a/vendor/golang.org/x/sys/unix/endian_little.go b/vendor/golang.org/x/sys/unix/endian_little.go index b0f2bc4ae..b9f0e277b 100644 --- a/vendor/golang.org/x/sys/unix/endian_little.go +++ b/vendor/golang.org/x/sys/unix/endian_little.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
// //go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh -// +build 386 amd64 amd64p32 alpha arm arm64 loong64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh package unix diff --git a/vendor/golang.org/x/sys/unix/env_unix.go b/vendor/golang.org/x/sys/unix/env_unix.go index 29ccc4d13..a96da71f4 100644 --- a/vendor/golang.org/x/sys/unix/env_unix.go +++ b/vendor/golang.org/x/sys/unix/env_unix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos // Unix environment variables. diff --git a/vendor/golang.org/x/sys/unix/epoll_zos.go b/vendor/golang.org/x/sys/unix/epoll_zos.go index cedaf7e02..7753fddea 100644 --- a/vendor/golang.org/x/sys/unix/epoll_zos.go +++ b/vendor/golang.org/x/sys/unix/epoll_zos.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x package unix diff --git a/vendor/golang.org/x/sys/unix/fcntl.go b/vendor/golang.org/x/sys/unix/fcntl.go index e9b991258..6200876fb 100644 --- a/vendor/golang.org/x/sys/unix/fcntl.go +++ b/vendor/golang.org/x/sys/unix/fcntl.go @@ -2,8 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build dragonfly || freebsd || linux || netbsd || openbsd -// +build dragonfly freebsd linux netbsd openbsd +//go:build dragonfly || freebsd || linux || netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go b/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go index 29d44808b..13b4acd5c 100644 --- a/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go +++ b/vendor/golang.org/x/sys/unix/fcntl_linux_32bit.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (linux && 386) || (linux && arm) || (linux && mips) || (linux && mipsle) || (linux && ppc) -// +build linux,386 linux,arm linux,mips linux,mipsle linux,ppc package unix diff --git a/vendor/golang.org/x/sys/unix/fdset.go b/vendor/golang.org/x/sys/unix/fdset.go index a8068f94f..9e83d18cd 100644 --- a/vendor/golang.org/x/sys/unix/fdset.go +++ b/vendor/golang.org/x/sys/unix/fdset.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos package unix diff --git a/vendor/golang.org/x/sys/unix/fstatfs_zos.go b/vendor/golang.org/x/sys/unix/fstatfs_zos.go index e377cc9f4..c8bde601e 100644 --- a/vendor/golang.org/x/sys/unix/fstatfs_zos.go +++ b/vendor/golang.org/x/sys/unix/fstatfs_zos.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x package unix diff --git a/vendor/golang.org/x/sys/unix/gccgo.go b/vendor/golang.org/x/sys/unix/gccgo.go index b06f52d74..aca5721dd 100644 --- a/vendor/golang.org/x/sys/unix/gccgo.go +++ b/vendor/golang.org/x/sys/unix/gccgo.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build gccgo && !aix && !hurd -// +build gccgo,!aix,!hurd package unix diff --git a/vendor/golang.org/x/sys/unix/gccgo_c.c b/vendor/golang.org/x/sys/unix/gccgo_c.c index f98a1c542..d468b7b47 100644 --- a/vendor/golang.org/x/sys/unix/gccgo_c.c +++ b/vendor/golang.org/x/sys/unix/gccgo_c.c @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gccgo && !aix && !hurd -// +build gccgo,!aix,!hurd #include #include diff --git a/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go b/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go index e60e49a3d..972d61bd7 100644 --- a/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/gccgo_linux_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build gccgo && linux && amd64 -// +build gccgo,linux,amd64 package unix diff --git a/vendor/golang.org/x/sys/unix/ifreq_linux.go b/vendor/golang.org/x/sys/unix/ifreq_linux.go index 15721a510..848840ae4 100644 --- a/vendor/golang.org/x/sys/unix/ifreq_linux.go +++ b/vendor/golang.org/x/sys/unix/ifreq_linux.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux -// +build linux package unix diff --git a/vendor/golang.org/x/sys/unix/ioctl_linux.go b/vendor/golang.org/x/sys/unix/ioctl_linux.go index 0d12c0851..dbe680eab 100644 --- a/vendor/golang.org/x/sys/unix/ioctl_linux.go +++ b/vendor/golang.org/x/sys/unix/ioctl_linux.go @@ -231,3 +231,8 @@ func IoctlLoopGetStatus64(fd int) (*LoopInfo64, error) { func IoctlLoopSetStatus64(fd int, value *LoopInfo64) error { return ioctlPtr(fd, LOOP_SET_STATUS64, unsafe.Pointer(value)) } + +// IoctlLoopConfigure configures all loop device parameters in a single step +func IoctlLoopConfigure(fd int, value *LoopConfig) error { + return ioctlPtr(fd, LOOP_CONFIGURE, unsafe.Pointer(value)) +} diff --git a/vendor/golang.org/x/sys/unix/ioctl_signed.go b/vendor/golang.org/x/sys/unix/ioctl_signed.go index 7def9580e..5b0759bd8 100644 --- a/vendor/golang.org/x/sys/unix/ioctl_signed.go +++ b/vendor/golang.org/x/sys/unix/ioctl_signed.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || solaris -// +build aix solaris package unix diff --git a/vendor/golang.org/x/sys/unix/ioctl_unsigned.go b/vendor/golang.org/x/sys/unix/ioctl_unsigned.go index 649913d1e..20f470b9d 100644 --- a/vendor/golang.org/x/sys/unix/ioctl_unsigned.go +++ b/vendor/golang.org/x/sys/unix/ioctl_unsigned.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd -// +build darwin dragonfly freebsd hurd linux netbsd openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ioctl_zos.go b/vendor/golang.org/x/sys/unix/ioctl_zos.go index cdc21bf76..c8b2a750f 100644 --- a/vendor/golang.org/x/sys/unix/ioctl_zos.go +++ b/vendor/golang.org/x/sys/unix/ioctl_zos.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
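Editor's note: the new IoctlLoopConfigure wrapper above issues the Linux 5.8+ LOOP_CONFIGURE ioctl, which is intended to replace the older LOOP_SET_FD followed by LOOP_SET_STATUS64 two-step. A hedged sketch of the call shape only; populating unix.LoopConfig (which fields, and with what values) is left to the caller, since the struct layout lives elsewhere in the vendored package:

package loopsetup

import "golang.org/x/sys/unix"

// configureLoop attaches and configures a backing file for an already-open
// loop device (/dev/loopN) in a single ioctl. cfg must be filled in by the
// caller before the call.
func configureLoop(loopFd int, cfg *unix.LoopConfig) error {
	return unix.IoctlLoopConfigure(loopFd, cfg)
}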
//go:build zos && s390x -// +build zos,s390x package unix diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh index 8f775fafa..fdcaa974d 100644 --- a/vendor/golang.org/x/sys/unix/mkerrors.sh +++ b/vendor/golang.org/x/sys/unix/mkerrors.sh @@ -248,6 +248,7 @@ struct ltchars { #include #include #include +#include #include #include #include @@ -283,10 +284,6 @@ struct ltchars { #include #endif -#ifndef MSG_FASTOPEN -#define MSG_FASTOPEN 0x20000000 -#endif - #ifndef PTRACE_GETREGS #define PTRACE_GETREGS 0xc #endif @@ -295,14 +292,6 @@ struct ltchars { #define PTRACE_SETREGS 0xd #endif -#ifndef SOL_NETLINK -#define SOL_NETLINK 270 -#endif - -#ifndef SOL_SMC -#define SOL_SMC 286 -#endif - #ifdef SOL_BLUETOOTH // SPARC includes this in /usr/include/sparc64-linux-gnu/bits/socket.h // but it is already in bluetooth_linux.go @@ -319,10 +308,23 @@ struct ltchars { #undef TIPC_WAIT_FOREVER #define TIPC_WAIT_FOREVER 0xffffffff -// Copied from linux/l2tp.h -// Including linux/l2tp.h here causes conflicts between linux/in.h -// and netinet/in.h included via net/route.h above. -#define IPPROTO_L2TP 115 +// Copied from linux/netfilter/nf_nat.h +// Including linux/netfilter/nf_nat.h here causes conflicts between linux/in.h +// and netinet/in.h. +#define NF_NAT_RANGE_MAP_IPS (1 << 0) +#define NF_NAT_RANGE_PROTO_SPECIFIED (1 << 1) +#define NF_NAT_RANGE_PROTO_RANDOM (1 << 2) +#define NF_NAT_RANGE_PERSISTENT (1 << 3) +#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4) +#define NF_NAT_RANGE_PROTO_OFFSET (1 << 5) +#define NF_NAT_RANGE_NETMAP (1 << 6) +#define NF_NAT_RANGE_PROTO_RANDOM_ALL \ + (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) +#define NF_NAT_RANGE_MASK \ + (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ + NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ + NF_NAT_RANGE_PROTO_RANDOM_FULLY | NF_NAT_RANGE_PROTO_OFFSET | \ + NF_NAT_RANGE_NETMAP) // Copied from linux/hid.h. // Keep in sync with the size of the referenced fields. @@ -519,6 +521,7 @@ ccflags="$@" $2 ~ /^LOCK_(SH|EX|NB|UN)$/ || $2 ~ /^LO_(KEY|NAME)_SIZE$/ || $2 ~ /^LOOP_(CLR|CTL|GET|SET)_/ || + $2 == "LOOP_CONFIGURE" || $2 ~ /^(AF|SOCK|SO|SOL|IPPROTO|IP|IPV6|TCP|MCAST|EVFILT|NOTE|SHUT|PROT|MAP|MREMAP|MFD|T?PACKET|MSG|SCM|MCL|DT|MADV|PR|LOCAL|TCPOPT|UDP)_/ || $2 ~ /^NFC_(GENL|PROTO|COMM|RF|SE|DIRECTION|LLCP|SOCKPROTO)_/ || $2 ~ /^NFC_.*_(MAX)?SIZE$/ || @@ -560,7 +563,7 @@ ccflags="$@" $2 ~ /^RLIMIT_(AS|CORE|CPU|DATA|FSIZE|LOCKS|MEMLOCK|MSGQUEUE|NICE|NOFILE|NPROC|RSS|RTPRIO|RTTIME|SIGPENDING|STACK)|RLIM_INFINITY/ || $2 ~ /^PRIO_(PROCESS|PGRP|USER)/ || $2 ~ /^CLONE_[A-Z_]+/ || - $2 !~ /^(BPF_TIMEVAL|BPF_FIB_LOOKUP_[A-Z]+)$/ && + $2 !~ /^(BPF_TIMEVAL|BPF_FIB_LOOKUP_[A-Z]+|BPF_F_LINK)$/ && $2 ~ /^(BPF|DLT)_/ || $2 ~ /^AUDIT_/ || $2 ~ /^(CLOCK|TIMER)_/ || @@ -581,8 +584,9 @@ ccflags="$@" $2 ~ /^KEY_(SPEC|REQKEY_DEFL)_/ || $2 ~ /^KEYCTL_/ || $2 ~ /^PERF_/ || - $2 ~ /^SECCOMP_MODE_/ || + $2 ~ /^SECCOMP_/ || $2 ~ /^SEEK_/ || + $2 ~ /^SCHED_/ || $2 ~ /^SPLICE_/ || $2 ~ /^SYNC_FILE_RANGE_/ || $2 !~ /IOC_MAGIC/ && @@ -601,6 +605,9 @@ ccflags="$@" $2 ~ /^FSOPT_/ || $2 ~ /^WDIO[CFS]_/ || $2 ~ /^NFN/ || + $2 !~ /^NFT_META_IIFTYPE/ && + $2 ~ /^NFT_/ || + $2 ~ /^NF_NAT_/ || $2 ~ /^XDP_/ || $2 ~ /^RWF_/ || $2 ~ /^(HDIO|WIN|SMART)_/ || @@ -662,7 +669,6 @@ echo '// mkerrors.sh' "$@" echo '// Code generated by the command above; see README.md. DO NOT EDIT.' 
echo echo "//go:build ${GOARCH} && ${GOOS}" -echo "// +build ${GOARCH},${GOOS}" echo go tool cgo -godefs -- "$@" _const.go >_error.out cat _error.out | grep -vf _error.grep | grep -vf _signal.grep diff --git a/vendor/golang.org/x/sys/unix/mmap_nomremap.go b/vendor/golang.org/x/sys/unix/mmap_nomremap.go index ca0513632..4b68e5978 100644 --- a/vendor/golang.org/x/sys/unix/mmap_nomremap.go +++ b/vendor/golang.org/x/sys/unix/mmap_nomremap.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || openbsd || solaris -// +build aix darwin dragonfly freebsd openbsd solaris package unix diff --git a/vendor/golang.org/x/sys/unix/mremap.go b/vendor/golang.org/x/sys/unix/mremap.go index fa93d0aa9..fd45fe529 100644 --- a/vendor/golang.org/x/sys/unix/mremap.go +++ b/vendor/golang.org/x/sys/unix/mremap.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux || netbsd -// +build linux netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/pagesize_unix.go b/vendor/golang.org/x/sys/unix/pagesize_unix.go index 53f1b4c5b..4d0a3430e 100644 --- a/vendor/golang.org/x/sys/unix/pagesize_unix.go +++ b/vendor/golang.org/x/sys/unix/pagesize_unix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris // For Unix, get the pagesize from the runtime. diff --git a/vendor/golang.org/x/sys/unix/pledge_openbsd.go b/vendor/golang.org/x/sys/unix/pledge_openbsd.go index eb48294b2..6a09af53e 100644 --- a/vendor/golang.org/x/sys/unix/pledge_openbsd.go +++ b/vendor/golang.org/x/sys/unix/pledge_openbsd.go @@ -8,54 +8,31 @@ import ( "errors" "fmt" "strconv" - "syscall" - "unsafe" ) // Pledge implements the pledge syscall. // -// The pledge syscall does not accept execpromises on OpenBSD releases -// before 6.3. -// -// execpromises must be empty when Pledge is called on OpenBSD -// releases predating 6.3, otherwise an error will be returned. +// This changes both the promises and execpromises; use PledgePromises or +// PledgeExecpromises to only change the promises or execpromises +// respectively. // // For more information see pledge(2). func Pledge(promises, execpromises string) error { - maj, min, err := majmin() + if err := pledgeAvailable(); err != nil { + return err + } + + pptr, err := BytePtrFromString(promises) if err != nil { return err } - err = pledgeAvailable(maj, min, execpromises) + exptr, err := BytePtrFromString(execpromises) if err != nil { return err } - pptr, err := syscall.BytePtrFromString(promises) - if err != nil { - return err - } - - // This variable will hold either a nil unsafe.Pointer or - // an unsafe.Pointer to a string (execpromises). - var expr unsafe.Pointer - - // If we're running on OpenBSD > 6.2, pass execpromises to the syscall. - if maj > 6 || (maj == 6 && min > 2) { - exptr, err := syscall.BytePtrFromString(execpromises) - if err != nil { - return err - } - expr = unsafe.Pointer(exptr) - } - - _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(unsafe.Pointer(pptr)), uintptr(expr), 0) - if e != 0 { - return e - } - - return nil + return pledge(pptr, exptr) } // PledgePromises implements the pledge syscall. @@ -64,30 +41,16 @@ func Pledge(promises, execpromises string) error { // // For more information see pledge(2). 
func PledgePromises(promises string) error { - maj, min, err := majmin() + if err := pledgeAvailable(); err != nil { + return err + } + + pptr, err := BytePtrFromString(promises) if err != nil { return err } - err = pledgeAvailable(maj, min, "") - if err != nil { - return err - } - - // This variable holds the execpromises and is always nil. - var expr unsafe.Pointer - - pptr, err := syscall.BytePtrFromString(promises) - if err != nil { - return err - } - - _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(unsafe.Pointer(pptr)), uintptr(expr), 0) - if e != 0 { - return e - } - - return nil + return pledge(pptr, nil) } // PledgeExecpromises implements the pledge syscall. @@ -96,30 +59,16 @@ func PledgePromises(promises string) error { // // For more information see pledge(2). func PledgeExecpromises(execpromises string) error { - maj, min, err := majmin() + if err := pledgeAvailable(); err != nil { + return err + } + + exptr, err := BytePtrFromString(execpromises) if err != nil { return err } - err = pledgeAvailable(maj, min, execpromises) - if err != nil { - return err - } - - // This variable holds the promises and is always nil. - var pptr unsafe.Pointer - - exptr, err := syscall.BytePtrFromString(execpromises) - if err != nil { - return err - } - - _, _, e := syscall.Syscall(SYS_PLEDGE, uintptr(pptr), uintptr(unsafe.Pointer(exptr)), 0) - if e != 0 { - return e - } - - return nil + return pledge(nil, exptr) } // majmin returns major and minor version number for an OpenBSD system. @@ -147,16 +96,15 @@ func majmin() (major int, minor int, err error) { // pledgeAvailable checks for availability of the pledge(2) syscall // based on the running OpenBSD version. -func pledgeAvailable(maj, min int, execpromises string) error { - // If OpenBSD <= 5.9, pledge is not available. - if (maj == 5 && min != 9) || maj < 5 { - return fmt.Errorf("pledge syscall is not available on OpenBSD %d.%d", maj, min) +func pledgeAvailable() error { + maj, min, err := majmin() + if err != nil { + return err } - // If OpenBSD <= 6.2 and execpromises is not empty, - // return an error - execpromises is not available before 6.3 - if (maj < 6 || (maj == 6 && min <= 2)) && execpromises != "" { - return fmt.Errorf("cannot use execpromises on OpenBSD %d.%d", maj, min) + // Require OpenBSD 6.4 as a minimum. + if maj < 6 || (maj == 6 && min <= 3) { + return fmt.Errorf("cannot call Pledge on OpenBSD %d.%d", maj, min) } return nil diff --git a/vendor/golang.org/x/sys/unix/ptrace_darwin.go b/vendor/golang.org/x/sys/unix/ptrace_darwin.go index 39dba6ca6..3f0975f3d 100644 --- a/vendor/golang.org/x/sys/unix/ptrace_darwin.go +++ b/vendor/golang.org/x/sys/unix/ptrace_darwin.go @@ -3,16 +3,9 @@ // license that can be found in the LICENSE file. //go:build darwin && !ios -// +build darwin,!ios package unix -import "unsafe" - func ptrace(request int, pid int, addr uintptr, data uintptr) error { return ptrace1(request, pid, addr, data) } - -func ptracePtr(request int, pid int, addr uintptr, data unsafe.Pointer) error { - return ptrace1Ptr(request, pid, addr, data) -} diff --git a/vendor/golang.org/x/sys/unix/ptrace_ios.go b/vendor/golang.org/x/sys/unix/ptrace_ios.go index 9ea66330a..a4d35db5d 100644 --- a/vendor/golang.org/x/sys/unix/ptrace_ios.go +++ b/vendor/golang.org/x/sys/unix/ptrace_ios.go @@ -3,16 +3,9 @@ // license that can be found in the LICENSE file. 
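Editor's note: the rewritten OpenBSD wrappers above keep the same exported surface (Pledge, PledgePromises, PledgeExecpromises) but now require OpenBSD 6.4 or newer. A minimal hedged sketch of the common pattern, which only builds on an OpenBSD target; the promise string is illustrative (see pledge(2)):

package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// Restrict this process to stdio and read-only filesystem access.
	// PledgePromises passes nil execpromises, so exec behaviour is unchanged.
	if err := unix.PledgePromises("stdio rpath"); err != nil {
		log.Fatal(err)
	}
	// ... continue running with the reduced privilege set ...
}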
//go:build ios -// +build ios package unix -import "unsafe" - func ptrace(request int, pid int, addr uintptr, data uintptr) (err error) { return ENOTSUP } - -func ptracePtr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) { - return ENOTSUP -} diff --git a/vendor/golang.org/x/sys/unix/race.go b/vendor/golang.org/x/sys/unix/race.go index 6f6c5fec5..714d2aae7 100644 --- a/vendor/golang.org/x/sys/unix/race.go +++ b/vendor/golang.org/x/sys/unix/race.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin && race) || (linux && race) || (freebsd && race) -// +build darwin,race linux,race freebsd,race package unix diff --git a/vendor/golang.org/x/sys/unix/race0.go b/vendor/golang.org/x/sys/unix/race0.go index 706e1322a..4a9f6634c 100644 --- a/vendor/golang.org/x/sys/unix/race0.go +++ b/vendor/golang.org/x/sys/unix/race0.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || (darwin && !race) || (linux && !race) || (freebsd && !race) || netbsd || openbsd || solaris || dragonfly || zos -// +build aix darwin,!race linux,!race freebsd,!race netbsd openbsd solaris dragonfly zos package unix diff --git a/vendor/golang.org/x/sys/unix/readdirent_getdents.go b/vendor/golang.org/x/sys/unix/readdirent_getdents.go index 4d6257569..dbd2b6ccb 100644 --- a/vendor/golang.org/x/sys/unix/readdirent_getdents.go +++ b/vendor/golang.org/x/sys/unix/readdirent_getdents.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || dragonfly || freebsd || linux || netbsd || openbsd -// +build aix dragonfly freebsd linux netbsd openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go b/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go index 2a4ba47c4..130398b6b 100644 --- a/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go +++ b/vendor/golang.org/x/sys/unix/readdirent_getdirentries.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build darwin -// +build darwin package unix diff --git a/vendor/golang.org/x/sys/unix/sockcmsg_unix.go b/vendor/golang.org/x/sys/unix/sockcmsg_unix.go index 3865943f6..c3a62dbb1 100644 --- a/vendor/golang.org/x/sys/unix/sockcmsg_unix.go +++ b/vendor/golang.org/x/sys/unix/sockcmsg_unix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos // Socket control messages diff --git a/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go b/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go index 0840fe4a5..4a1eab37e 100644 --- a/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go +++ b/vendor/golang.org/x/sys/unix/sockcmsg_unix_other.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin freebsd linux netbsd openbsd solaris zos package unix diff --git a/vendor/golang.org/x/sys/unix/syscall.go b/vendor/golang.org/x/sys/unix/syscall.go index 63e8c8383..5ea74da98 100644 --- a/vendor/golang.org/x/sys/unix/syscall.go +++ b/vendor/golang.org/x/sys/unix/syscall.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos // Package unix contains an interface to the low-level operating system // primitives. OS details vary depending on the underlying system, and diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go index 9a6e5acac..67ce6cef2 100644 --- a/vendor/golang.org/x/sys/unix/syscall_aix.go +++ b/vendor/golang.org/x/sys/unix/syscall_aix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix -// +build aix // Aix system calls. // This file is compiled as ordinary Go code, @@ -107,7 +106,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) { if n > 0 { sl += _Socklen(n) + 1 } - if sa.raw.Path[0] == '@' { + if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) { + // Check sl > 3 so we don't change unnamed socket behavior. sa.raw.Path[0] = 0 // Don't count trailing NUL for abstract address. sl-- @@ -487,8 +487,6 @@ func Fsync(fd int) error { //sys Unlinkat(dirfd int, path string, flags int) (err error) //sys Ustat(dev int, ubuf *Ustat_t) (err error) //sys write(fd int, p []byte) (n int, err error) -//sys readlen(fd int, p *byte, np int) (n int, err error) = read -//sys writelen(fd int, p *byte, np int) (n int, err error) = write //sys Dup2(oldfd int, newfd int) (err error) //sys Fadvise(fd int, offset int64, length int64, advice int) (err error) = posix_fadvise64 diff --git a/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go b/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go index f2871fa95..1fdaa4760 100644 --- a/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go +++ b/vendor/golang.org/x/sys/unix/syscall_aix_ppc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix && ppc -// +build aix,ppc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go b/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go index 75718ec0f..c87f9a9f4 100644 --- a/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go +++ b/vendor/golang.org/x/sys/unix/syscall_aix_ppc64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix && ppc64 -// +build aix,ppc64 package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_bsd.go b/vendor/golang.org/x/sys/unix/syscall_bsd.go index 4217de518..a00c3e545 100644 --- a/vendor/golang.org/x/sys/unix/syscall_bsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_bsd.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build darwin || dragonfly || freebsd || netbsd || openbsd -// +build darwin dragonfly freebsd netbsd openbsd // BSD system call wrappers shared by *BSD based systems // including OS X (Darwin) and FreeBSD. 
Like the other @@ -317,7 +316,7 @@ func GetsockoptString(fd, level, opt int) (string, error) { if err != nil { return "", err } - return string(buf[:vallen-1]), nil + return ByteSliceToString(buf[:vallen]), nil } //sys recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go index 135cc3cd7..59542a897 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go @@ -644,189 +644,3 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) { //sys write(fd int, p []byte) (n int, err error) //sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) //sys munmap(addr uintptr, length uintptr) (err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ -//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE - -/* - * Unimplemented - */ -// Profil -// Sigaction -// Sigprocmask -// Getlogin -// Sigpending -// Sigaltstack -// Ioctl -// Reboot -// Execve -// Vfork -// Sbrk -// Sstk -// Ovadvise -// Mincore -// Setitimer -// Swapon -// Select -// Sigsuspend -// Readv -// Writev -// Nfssvc -// Getfh -// Quotactl -// Csops -// Waitid -// Add_profil -// Kdebug_trace -// Sigreturn -// Atsocket -// Kqueue_from_portset_np -// Kqueue_portset -// Getattrlist -// Getdirentriesattr -// Searchfs -// Delete -// Copyfile -// Watchevent -// Waitevent -// Modwatch -// Fsctl -// Initgroups -// Posix_spawn -// Nfsclnt -// Fhopen -// Minherit -// Semsys -// Msgsys -// Shmsys -// Semctl -// Semget -// Semop -// Msgctl -// Msgget -// Msgsnd -// Msgrcv -// Shm_open -// Shm_unlink -// Sem_open -// Sem_close -// Sem_unlink -// Sem_wait -// Sem_trywait -// Sem_post -// Sem_getvalue -// Sem_init -// Sem_destroy -// Open_extended -// Umask_extended -// Stat_extended -// Lstat_extended -// Fstat_extended -// Chmod_extended -// Fchmod_extended -// Access_extended -// Settid -// Gettid -// Setsgroups -// Getsgroups -// Setwgroups -// Getwgroups -// Mkfifo_extended -// Mkdir_extended -// Identitysvc -// Shared_region_check_np -// Shared_region_map_np -// __pthread_mutex_destroy -// __pthread_mutex_init -// __pthread_mutex_lock -// __pthread_mutex_trylock -// __pthread_mutex_unlock -// __pthread_cond_init -// __pthread_cond_destroy -// __pthread_cond_broadcast -// __pthread_cond_signal -// Setsid_with_pid -// __pthread_cond_timedwait -// Aio_fsync -// Aio_return -// Aio_suspend -// Aio_cancel -// Aio_error -// Aio_read -// Aio_write -// Lio_listio -// __pthread_cond_wait -// Iopolicysys -// __pthread_kill -// __pthread_sigmask -// __sigwait -// __disable_threadsignal -// __pthread_markcancel -// __pthread_canceled -// __semwait_signal -// Proc_info -// sendfile -// Stat64_extended -// Lstat64_extended -// Fstat64_extended -// __pthread_chdir -// __pthread_fchdir -// Audit -// Auditon -// Getauid -// Setauid -// Getaudit -// Setaudit -// Getaudit_addr -// Setaudit_addr -// Auditctl -// Bsdthread_create -// Bsdthread_terminate -// Stack_snapshot -// Bsdthread_register -// Workq_open -// Workq_ops -// __mac_execve -// __mac_syscall -// __mac_get_file -// __mac_set_file -// __mac_get_link -// __mac_set_link -// __mac_get_proc -// __mac_set_proc -// __mac_get_fd -// __mac_set_fd -// __mac_get_pid -// __mac_get_lcid -// __mac_get_lctx -// __mac_set_lctx -// Setlcid -// Read_nocancel -// Write_nocancel -// Open_nocancel -// 
Close_nocancel -// Wait4_nocancel -// Recvmsg_nocancel -// Sendmsg_nocancel -// Recvfrom_nocancel -// Accept_nocancel -// Fcntl_nocancel -// Select_nocancel -// Fsync_nocancel -// Connect_nocancel -// Sigsuspend_nocancel -// Readv_nocancel -// Writev_nocancel -// Sendto_nocancel -// Pread_nocancel -// Pwrite_nocancel -// Waitid_nocancel -// Poll_nocancel -// Msgsnd_nocancel -// Msgrcv_nocancel -// Sem_wait_nocancel -// Aio_suspend_nocancel -// __sigwait_nocancel -// __semwait_signal_nocancel -// __mac_mount -// __mac_get_mount -// __mac_getfsstat diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go index 9fa879806..0eaecf5fc 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && darwin -// +build amd64,darwin package unix @@ -47,6 +46,5 @@ func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, //sys getfsstat(buf unsafe.Pointer, size uintptr, flags int) (n int, err error) = SYS_GETFSSTAT64 //sys Lstat(path string, stat *Stat_t) (err error) = SYS_LSTAT64 //sys ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) = SYS_ptrace -//sys ptrace1Ptr(request int, pid int, addr unsafe.Pointer, data uintptr) (err error) = SYS_ptrace //sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64 //sys Statfs(path string, stat *Statfs_t) (err error) = SYS_STATFS64 diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go index f17b8c526..f36c6707c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm64 && darwin -// +build arm64,darwin package unix @@ -47,6 +46,5 @@ func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, //sys getfsstat(buf unsafe.Pointer, size uintptr, flags int) (n int, err error) = SYS_GETFSSTAT //sys Lstat(path string, stat *Stat_t) (err error) //sys ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) = SYS_ptrace -//sys ptrace1Ptr(request int, pid int, addr unsafe.Pointer, data uintptr) (err error) = SYS_ptrace //sys Stat(path string, stat *Stat_t) (err error) //sys Statfs(path string, stat *Statfs_t) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go b/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go index 53c96641f..2f0fa76e4 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin_libSystem.go @@ -2,8 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
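For context on the GetsockoptString hunk above (syscall_bsd.go; the same change is repeated later for Linux, Solaris and z/OS): the old return of string(buf[:vallen-1]) assumed the kernel always appended a trailing NUL, while ByteSliceToString truncates at the first NUL if one is present and otherwise keeps all vallen bytes. A minimal sketch of the helper's behavior, using only the exported helper from golang.org/x/sys/unix (the sample buffers are illustrative, not real socket option values):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// A value the kernel NUL-terminated.
	withNUL := []byte{'c', 'u', 'b', 'i', 'c', 0}
	// A value returned without a trailing NUL.
	withoutNUL := []byte{'c', 'u', 'b', 'i', 'c'}

	// ByteSliceToString stops at the first NUL, or uses the whole slice.
	fmt.Println(unix.ByteSliceToString(withNUL))    // "cubic"
	fmt.Println(unix.ByteSliceToString(withoutNUL)) // "cubic"

	// The previous string(buf[:vallen-1]) would have silently dropped the
	// final byte whenever the kernel did not include a trailing NUL.
}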
-//go:build darwin && go1.12 -// +build darwin,go1.12 +//go:build darwin package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go index d4ce988e7..97cb916f2 100644 --- a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go +++ b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go @@ -343,203 +343,5 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e //sys write(fd int, p []byte) (n int, err error) //sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) //sys munmap(addr uintptr, length uintptr) (err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ -//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE //sys accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) //sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) - -/* - * Unimplemented - * TODO(jsing): Update this list for DragonFly. - */ -// Profil -// Sigaction -// Sigprocmask -// Getlogin -// Sigpending -// Sigaltstack -// Reboot -// Execve -// Vfork -// Sbrk -// Sstk -// Ovadvise -// Mincore -// Setitimer -// Swapon -// Select -// Sigsuspend -// Readv -// Writev -// Nfssvc -// Getfh -// Quotactl -// Mount -// Csops -// Waitid -// Add_profil -// Kdebug_trace -// Sigreturn -// Atsocket -// Kqueue_from_portset_np -// Kqueue_portset -// Getattrlist -// Setattrlist -// Getdirentriesattr -// Searchfs -// Delete -// Copyfile -// Watchevent -// Waitevent -// Modwatch -// Getxattr -// Fgetxattr -// Setxattr -// Fsetxattr -// Removexattr -// Fremovexattr -// Listxattr -// Flistxattr -// Fsctl -// Initgroups -// Posix_spawn -// Nfsclnt -// Fhopen -// Minherit -// Semsys -// Msgsys -// Shmsys -// Semctl -// Semget -// Semop -// Msgctl -// Msgget -// Msgsnd -// Msgrcv -// Shmat -// Shmctl -// Shmdt -// Shmget -// Shm_open -// Shm_unlink -// Sem_open -// Sem_close -// Sem_unlink -// Sem_wait -// Sem_trywait -// Sem_post -// Sem_getvalue -// Sem_init -// Sem_destroy -// Open_extended -// Umask_extended -// Stat_extended -// Lstat_extended -// Fstat_extended -// Chmod_extended -// Fchmod_extended -// Access_extended -// Settid -// Gettid -// Setsgroups -// Getsgroups -// Setwgroups -// Getwgroups -// Mkfifo_extended -// Mkdir_extended -// Identitysvc -// Shared_region_check_np -// Shared_region_map_np -// __pthread_mutex_destroy -// __pthread_mutex_init -// __pthread_mutex_lock -// __pthread_mutex_trylock -// __pthread_mutex_unlock -// __pthread_cond_init -// __pthread_cond_destroy -// __pthread_cond_broadcast -// __pthread_cond_signal -// Setsid_with_pid -// __pthread_cond_timedwait -// Aio_fsync -// Aio_return -// Aio_suspend -// Aio_cancel -// Aio_error -// Aio_read -// Aio_write -// Lio_listio -// __pthread_cond_wait -// Iopolicysys -// __pthread_kill -// __pthread_sigmask -// __sigwait -// __disable_threadsignal -// __pthread_markcancel -// __pthread_canceled -// __semwait_signal -// Proc_info -// Stat64_extended -// Lstat64_extended -// Fstat64_extended -// __pthread_chdir -// __pthread_fchdir -// Audit -// Auditon -// Getauid -// Setauid -// Getaudit -// Setaudit -// Getaudit_addr -// Setaudit_addr -// Auditctl -// Bsdthread_create -// Bsdthread_terminate -// Stack_snapshot -// Bsdthread_register -// Workq_open -// Workq_ops -// __mac_execve -// __mac_syscall -// __mac_get_file -// __mac_set_file -// __mac_get_link -// __mac_set_link -// __mac_get_proc -// __mac_set_proc -// 
__mac_get_fd -// __mac_set_fd -// __mac_get_pid -// __mac_get_lcid -// __mac_get_lctx -// __mac_set_lctx -// Setlcid -// Read_nocancel -// Write_nocancel -// Open_nocancel -// Close_nocancel -// Wait4_nocancel -// Recvmsg_nocancel -// Sendmsg_nocancel -// Recvfrom_nocancel -// Accept_nocancel -// Fcntl_nocancel -// Select_nocancel -// Fsync_nocancel -// Connect_nocancel -// Sigsuspend_nocancel -// Readv_nocancel -// Writev_nocancel -// Sendto_nocancel -// Pread_nocancel -// Pwrite_nocancel -// Waitid_nocancel -// Msgsnd_nocancel -// Msgrcv_nocancel -// Sem_wait_nocancel -// Aio_suspend_nocancel -// __sigwait_nocancel -// __semwait_signal_nocancel -// __mac_mount -// __mac_get_mount -// __mac_getfsstat diff --git a/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go index 4e2d32120..14bab6b2d 100644 --- a/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_dragonfly_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && dragonfly -// +build amd64,dragonfly package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd.go b/vendor/golang.org/x/sys/unix/syscall_freebsd.go index afb10106f..2b57e0f73 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd.go @@ -13,6 +13,7 @@ package unix import ( + "errors" "sync" "unsafe" ) @@ -169,25 +170,26 @@ func Getfsstat(buf []Statfs_t, flags int) (n int, err error) { func Uname(uname *Utsname) error { mib := []_C_int{CTL_KERN, KERN_OSTYPE} n := unsafe.Sizeof(uname.Sysname) - if err := sysctl(mib, &uname.Sysname[0], &n, nil, 0); err != nil { + // Suppress ENOMEM errors to be compatible with the C library __xuname() implementation. 
+ if err := sysctl(mib, &uname.Sysname[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) { return err } mib = []_C_int{CTL_KERN, KERN_HOSTNAME} n = unsafe.Sizeof(uname.Nodename) - if err := sysctl(mib, &uname.Nodename[0], &n, nil, 0); err != nil { + if err := sysctl(mib, &uname.Nodename[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) { return err } mib = []_C_int{CTL_KERN, KERN_OSRELEASE} n = unsafe.Sizeof(uname.Release) - if err := sysctl(mib, &uname.Release[0], &n, nil, 0); err != nil { + if err := sysctl(mib, &uname.Release[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) { return err } mib = []_C_int{CTL_KERN, KERN_VERSION} n = unsafe.Sizeof(uname.Version) - if err := sysctl(mib, &uname.Version[0], &n, nil, 0); err != nil { + if err := sysctl(mib, &uname.Version[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) { return err } @@ -205,7 +207,7 @@ func Uname(uname *Utsname) error { mib = []_C_int{CTL_HW, HW_MACHINE} n = unsafe.Sizeof(uname.Machine) - if err := sysctl(mib, &uname.Machine[0], &n, nil, 0); err != nil { + if err := sysctl(mib, &uname.Machine[0], &n, nil, 0); err != nil && !errors.Is(err, ENOMEM) { return err } @@ -449,197 +451,5 @@ func Dup3(oldfd, newfd, flags int) error { //sys write(fd int, p []byte) (n int, err error) //sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) //sys munmap(addr uintptr, length uintptr) (err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ -//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE //sys accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) //sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) - -/* - * Unimplemented - */ -// Profil -// Sigaction -// Sigprocmask -// Getlogin -// Sigpending -// Sigaltstack -// Ioctl -// Reboot -// Execve -// Vfork -// Sbrk -// Sstk -// Ovadvise -// Mincore -// Setitimer -// Swapon -// Select -// Sigsuspend -// Readv -// Writev -// Nfssvc -// Getfh -// Quotactl -// Mount -// Csops -// Waitid -// Add_profil -// Kdebug_trace -// Sigreturn -// Atsocket -// Kqueue_from_portset_np -// Kqueue_portset -// Getattrlist -// Setattrlist -// Getdents -// Getdirentriesattr -// Searchfs -// Delete -// Copyfile -// Watchevent -// Waitevent -// Modwatch -// Fsctl -// Initgroups -// Posix_spawn -// Nfsclnt -// Fhopen -// Minherit -// Semsys -// Msgsys -// Shmsys -// Semctl -// Semget -// Semop -// Msgctl -// Msgget -// Msgsnd -// Msgrcv -// Shmat -// Shmctl -// Shmdt -// Shmget -// Shm_open -// Shm_unlink -// Sem_open -// Sem_close -// Sem_unlink -// Sem_wait -// Sem_trywait -// Sem_post -// Sem_getvalue -// Sem_init -// Sem_destroy -// Open_extended -// Umask_extended -// Stat_extended -// Lstat_extended -// Fstat_extended -// Chmod_extended -// Fchmod_extended -// Access_extended -// Settid -// Gettid -// Setsgroups -// Getsgroups -// Setwgroups -// Getwgroups -// Mkfifo_extended -// Mkdir_extended -// Identitysvc -// Shared_region_check_np -// Shared_region_map_np -// __pthread_mutex_destroy -// __pthread_mutex_init -// __pthread_mutex_lock -// __pthread_mutex_trylock -// __pthread_mutex_unlock -// __pthread_cond_init -// __pthread_cond_destroy -// __pthread_cond_broadcast -// __pthread_cond_signal -// Setsid_with_pid -// __pthread_cond_timedwait -// Aio_fsync -// Aio_return -// Aio_suspend -// Aio_cancel -// Aio_error -// Aio_read -// Aio_write -// Lio_listio -// __pthread_cond_wait -// Iopolicysys -// __pthread_kill -// 
__pthread_sigmask -// __sigwait -// __disable_threadsignal -// __pthread_markcancel -// __pthread_canceled -// __semwait_signal -// Proc_info -// Stat64_extended -// Lstat64_extended -// Fstat64_extended -// __pthread_chdir -// __pthread_fchdir -// Audit -// Auditon -// Getauid -// Setauid -// Getaudit -// Setaudit -// Getaudit_addr -// Setaudit_addr -// Auditctl -// Bsdthread_create -// Bsdthread_terminate -// Stack_snapshot -// Bsdthread_register -// Workq_open -// Workq_ops -// __mac_execve -// __mac_syscall -// __mac_get_file -// __mac_set_file -// __mac_get_link -// __mac_set_link -// __mac_get_proc -// __mac_set_proc -// __mac_get_fd -// __mac_set_fd -// __mac_get_pid -// __mac_get_lcid -// __mac_get_lctx -// __mac_set_lctx -// Setlcid -// Read_nocancel -// Write_nocancel -// Open_nocancel -// Close_nocancel -// Wait4_nocancel -// Recvmsg_nocancel -// Sendmsg_nocancel -// Recvfrom_nocancel -// Accept_nocancel -// Fcntl_nocancel -// Select_nocancel -// Fsync_nocancel -// Connect_nocancel -// Sigsuspend_nocancel -// Readv_nocancel -// Writev_nocancel -// Sendto_nocancel -// Pread_nocancel -// Pwrite_nocancel -// Waitid_nocancel -// Poll_nocancel -// Msgsnd_nocancel -// Msgrcv_nocancel -// Sem_wait_nocancel -// Aio_suspend_nocancel -// __sigwait_nocancel -// __semwait_signal_nocancel -// __mac_mount -// __mac_get_mount -// __mac_getfsstat diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go index b8da51004..3967bca77 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && freebsd -// +build 386,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go index 47155c483..eff19ada2 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && freebsd -// +build amd64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go index 08932093f..4f24b517a 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm && freebsd -// +build arm,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go index d151a0d0e..ac30759ec 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm64 && freebsd -// +build arm64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go index d5cd64b37..aab725ca7 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_riscv64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
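The Uname change in syscall_freebsd.go above ignores ENOMEM from each sysctl read, matching the C library's __xuname(): a field longer than the fixed Utsname array is silently truncated instead of failing the whole call. A short usage sketch for context (the call pattern itself is unchanged by the patch):

package main

import (
	"fmt"
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	var u unix.Utsname
	// With the patch, an over-long kernel string is truncated to fit the
	// fixed-size field instead of surfacing ENOMEM from sysctl.
	if err := unix.Uname(&u); err != nil {
		log.Fatal(err)
	}
	fmt.Println(unix.ByteSliceToString(u.Sysname[:]), unix.ByteSliceToString(u.Release[:]))
}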
//go:build riscv64 && freebsd -// +build riscv64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go index 381fd4673..ba46651f8 100644 --- a/vendor/golang.org/x/sys/unix/syscall_hurd.go +++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build hurd -// +build hurd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd_386.go b/vendor/golang.org/x/sys/unix/syscall_hurd_386.go index 7cf54a3e4..df89f9e6b 100644 --- a/vendor/golang.org/x/sys/unix/syscall_hurd_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_hurd_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && hurd -// +build 386,hurd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_illumos.go b/vendor/golang.org/x/sys/unix/syscall_illumos.go index 87db5a6a8..a863f7052 100644 --- a/vendor/golang.org/x/sys/unix/syscall_illumos.go +++ b/vendor/golang.org/x/sys/unix/syscall_illumos.go @@ -5,7 +5,6 @@ // illumos system calls not present on Solaris. //go:build amd64 && illumos -// +build amd64,illumos package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go index a730878e4..5682e2628 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux.go @@ -61,15 +61,23 @@ func FanotifyMark(fd int, flags uint, mask uint64, dirFd int, pathname string) ( } //sys fchmodat(dirfd int, path string, mode uint32) (err error) +//sys fchmodat2(dirfd int, path string, mode uint32, flags int) (err error) -func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { - // Linux fchmodat doesn't support the flags parameter. Mimick glibc's behavior - // and check the flags. Otherwise the mode would be applied to the symlink - // destination which is not what the user expects. - if flags&^AT_SYMLINK_NOFOLLOW != 0 { - return EINVAL - } else if flags&AT_SYMLINK_NOFOLLOW != 0 { - return EOPNOTSUPP +func Fchmodat(dirfd int, path string, mode uint32, flags int) error { + // Linux fchmodat doesn't support the flags parameter, but fchmodat2 does. + // Try fchmodat2 if flags are specified. + if flags != 0 { + err := fchmodat2(dirfd, path, mode, flags) + if err == ENOSYS { + // fchmodat2 isn't available. If the flags are known to be valid, + // return EOPNOTSUPP to indicate that fchmodat doesn't support them. + if flags&^(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 { + return EINVAL + } else if flags&(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 { + return EOPNOTSUPP + } + } + return err } return fchmodat(dirfd, path, mode) } @@ -417,7 +425,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) { if n > 0 { sl += _Socklen(n) + 1 } - if sa.raw.Path[0] == '@' { + if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) { + // Check sl > 3 so we don't change unnamed socket behavior. sa.raw.Path[0] = 0 // Don't count trailing NUL for abstract address. sl-- @@ -693,10 +702,10 @@ type SockaddrALG struct { func (sa *SockaddrALG) sockaddr() (unsafe.Pointer, _Socklen, error) { // Leave room for NUL byte terminator. 
- if len(sa.Type) > 13 { + if len(sa.Type) > len(sa.raw.Type)-1 { return nil, 0, EINVAL } - if len(sa.Name) > 63 { + if len(sa.Name) > len(sa.raw.Name)-1 { return nil, 0, EINVAL } @@ -704,17 +713,8 @@ func (sa *SockaddrALG) sockaddr() (unsafe.Pointer, _Socklen, error) { sa.raw.Feat = sa.Feature sa.raw.Mask = sa.Mask - typ, err := ByteSliceFromString(sa.Type) - if err != nil { - return nil, 0, err - } - name, err := ByteSliceFromString(sa.Name) - if err != nil { - return nil, 0, err - } - - copy(sa.raw.Type[:], typ) - copy(sa.raw.Name[:], name) + copy(sa.raw.Type[:], sa.Type) + copy(sa.raw.Name[:], sa.Name) return unsafe.Pointer(&sa.raw), SizeofSockaddrALG, nil } @@ -1310,7 +1310,7 @@ func GetsockoptString(fd, level, opt int) (string, error) { return "", err } } - return string(buf[:vallen-1]), nil + return ByteSliceToString(buf[:vallen]), nil } func GetsockoptTpacketStats(fd, level, opt int) (*TpacketStats, error) { @@ -1849,6 +1849,105 @@ func Dup2(oldfd, newfd int) error { //sys Fsmount(fd int, flags int, mountAttrs int) (fsfd int, err error) //sys Fsopen(fsName string, flags int) (fd int, err error) //sys Fspick(dirfd int, pathName string, flags int) (fd int, err error) + +//sys fsconfig(fd int, cmd uint, key *byte, value *byte, aux int) (err error) + +func fsconfigCommon(fd int, cmd uint, key string, value *byte, aux int) (err error) { + var keyp *byte + if keyp, err = BytePtrFromString(key); err != nil { + return + } + return fsconfig(fd, cmd, keyp, value, aux) +} + +// FsconfigSetFlag is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_FLAG. +// +// fd is the filesystem context to act upon. +// key the parameter key to set. +func FsconfigSetFlag(fd int, key string) (err error) { + return fsconfigCommon(fd, FSCONFIG_SET_FLAG, key, nil, 0) +} + +// FsconfigSetString is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_STRING. +// +// fd is the filesystem context to act upon. +// key the parameter key to set. +// value is the parameter value to set. +func FsconfigSetString(fd int, key string, value string) (err error) { + var valuep *byte + if valuep, err = BytePtrFromString(value); err != nil { + return + } + return fsconfigCommon(fd, FSCONFIG_SET_STRING, key, valuep, 0) +} + +// FsconfigSetBinary is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_BINARY. +// +// fd is the filesystem context to act upon. +// key the parameter key to set. +// value is the parameter value to set. +func FsconfigSetBinary(fd int, key string, value []byte) (err error) { + if len(value) == 0 { + return EINVAL + } + return fsconfigCommon(fd, FSCONFIG_SET_BINARY, key, &value[0], len(value)) +} + +// FsconfigSetPath is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_PATH. +// +// fd is the filesystem context to act upon. +// key the parameter key to set. +// path is a non-empty path for specified key. +// atfd is a file descriptor at which to start lookup from or AT_FDCWD. +func FsconfigSetPath(fd int, key string, path string, atfd int) (err error) { + var valuep *byte + if valuep, err = BytePtrFromString(path); err != nil { + return + } + return fsconfigCommon(fd, FSCONFIG_SET_PATH, key, valuep, atfd) +} + +// FsconfigSetPathEmpty is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_PATH_EMPTY. The same as +// FconfigSetPath but with AT_PATH_EMPTY implied. 
+func FsconfigSetPathEmpty(fd int, key string, path string, atfd int) (err error) { + var valuep *byte + if valuep, err = BytePtrFromString(path); err != nil { + return + } + return fsconfigCommon(fd, FSCONFIG_SET_PATH_EMPTY, key, valuep, atfd) +} + +// FsconfigSetFd is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_SET_FD. +// +// fd is the filesystem context to act upon. +// key the parameter key to set. +// value is a file descriptor to be assigned to specified key. +func FsconfigSetFd(fd int, key string, value int) (err error) { + return fsconfigCommon(fd, FSCONFIG_SET_FD, key, nil, value) +} + +// FsconfigCreate is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_CMD_CREATE. +// +// fd is the filesystem context to act upon. +func FsconfigCreate(fd int) (err error) { + return fsconfig(fd, FSCONFIG_CMD_CREATE, nil, nil, 0) +} + +// FsconfigReconfigure is equivalent to fsconfig(2) called +// with cmd == FSCONFIG_CMD_RECONFIGURE. +// +// fd is the filesystem context to act upon. +func FsconfigReconfigure(fd int) (err error) { + return fsconfig(fd, FSCONFIG_CMD_RECONFIGURE, nil, nil, 0) +} + //sys Getdents(fd int, buf []byte) (n int, err error) = SYS_GETDENTS64 //sysnb Getpgid(pid int) (pgid int, err error) @@ -1988,8 +2087,6 @@ func Signalfd(fd int, sigmask *Sigset_t, flags int) (newfd int, err error) { //sys Unshare(flags int) (err error) //sys write(fd int, p []byte) (n int, err error) //sys exitThread(code int) (err error) = SYS_EXIT -//sys readlen(fd int, p *byte, np int) (n int, err error) = SYS_READ -//sys writelen(fd int, p *byte, np int) (n int, err error) = SYS_WRITE //sys readv(fd int, iovs []Iovec) (n int, err error) = SYS_READV //sys writev(fd int, iovs []Iovec) (n int, err error) = SYS_WRITEV //sys preadv(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr) (n int, err error) = SYS_PREADV @@ -2471,98 +2568,27 @@ func Pselect(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timespec, sigmask * return pselect6(nfd, r, w, e, mutableTimeout, kernelMask) } -/* - * Unimplemented - */ -// AfsSyscall -// ArchPrctl -// Brk -// ClockNanosleep -// ClockSettime -// Clone -// EpollCtlOld -// EpollPwait -// EpollWaitOld -// Execve -// Fork -// Futex -// GetKernelSyms -// GetMempolicy -// GetRobustList -// GetThreadArea -// Getpmsg -// IoCancel -// IoDestroy -// IoGetevents -// IoSetup -// IoSubmit -// IoprioGet -// IoprioSet -// KexecLoad -// LookupDcookie -// Mbind -// MigratePages -// Mincore -// ModifyLdt -// Mount -// MovePages -// MqGetsetattr -// MqNotify -// MqOpen -// MqTimedreceive -// MqTimedsend -// MqUnlink -// Msgctl -// Msgget -// Msgrcv -// Msgsnd -// Nfsservctl -// Personality -// Pselect6 -// Ptrace -// Putpmsg -// Quotactl -// Readahead -// Readv -// RemapFilePages -// RestartSyscall -// RtSigaction -// RtSigpending -// RtSigqueueinfo -// RtSigreturn -// RtSigsuspend -// RtSigtimedwait -// SchedGetPriorityMax -// SchedGetPriorityMin -// SchedGetparam -// SchedGetscheduler -// SchedRrGetInterval -// SchedSetparam -// SchedYield -// Security -// Semctl -// Semget -// Semop -// Semtimedop -// SetMempolicy -// SetRobustList -// SetThreadArea -// SetTidAddress -// Sigaltstack -// Swapoff -// Swapon -// Sysfs -// TimerCreate -// TimerDelete -// TimerGetoverrun -// TimerGettime -// TimerSettime -// Tkill (obsolete) -// Tuxcall -// Umount2 -// Uselib -// Utimensat -// Vfork -// Vhangup -// Vserver -// _Sysctl +//sys schedSetattr(pid int, attr *SchedAttr, flags uint) (err error) +//sys schedGetattr(pid int, attr *SchedAttr, size uint, flags uint) (err error) + 
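The FsconfigSetFlag/FsconfigSetString/FsconfigCreate family added above exposes the Linux new-mount API directly on top of the existing Fsopen/Fsmount/MoveMount wrappers. A hedged sketch of creating and attaching a tmpfs via fsopen(2)/fsconfig(2)/fsmount(2)/move_mount(2); it assumes a kernel new enough for these syscalls, and the mount size and the target path /mnt/scratch are purely illustrative:

package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// Start a filesystem context for tmpfs.
	fsfd, err := unix.Fsopen("tmpfs", unix.FSOPEN_CLOEXEC)
	if err != nil {
		log.Fatal(err)
	}
	defer unix.Close(fsfd)

	// Set parameters with the new typed helpers, then instantiate the fs.
	if err := unix.FsconfigSetString(fsfd, "size", "16m"); err != nil {
		log.Fatal(err)
	}
	if err := unix.FsconfigCreate(fsfd); err != nil {
		log.Fatal(err)
	}

	// Turn the context into a mount fd and attach it at the target path.
	mfd, err := unix.Fsmount(fsfd, unix.FSMOUNT_CLOEXEC, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer unix.Close(mfd)

	if err := unix.MoveMount(mfd, "", unix.AT_FDCWD, "/mnt/scratch", unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
		log.Fatal(err)
	}
}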
+// SchedSetAttr is a wrapper for sched_setattr(2) syscall. +// https://man7.org/linux/man-pages/man2/sched_setattr.2.html +func SchedSetAttr(pid int, attr *SchedAttr, flags uint) error { + if attr == nil { + return EINVAL + } + attr.Size = SizeofSchedAttr + return schedSetattr(pid, attr, flags) +} + +// SchedGetAttr is a wrapper for sched_getattr(2) syscall. +// https://man7.org/linux/man-pages/man2/sched_getattr.2.html +func SchedGetAttr(pid int, flags uint) (*SchedAttr, error) { + attr := &SchedAttr{} + if err := schedGetattr(pid, attr, SizeofSchedAttr, flags); err != nil { + return nil, err + } + return attr, nil +} + +//sys Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_386.go index c7d9945ea..506dafa7b 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && linux -// +build 386,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go b/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go index 08086ac6a..38d55641b 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_alarm.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (386 || amd64 || mips || mipsle || mips64 || mipsle || ppc64 || ppc64le || ppc || s390x || sparc64) -// +build linux -// +build 386 amd64 mips mipsle mips64 mipsle ppc64 ppc64le ppc s390x sparc64 package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go index 70601ce36..d557cf8de 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && linux -// +build amd64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go b/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go index 8b0f0f3aa..facdb83b2 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_amd64_gc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && linux && gc -// +build amd64,linux,gc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go index da2986415..cd2dd797f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm && linux -// +build arm,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go index f5266689a..cf2ee6c75 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
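SchedSetAttr and SchedGetAttr above wrap sched_setattr(2)/sched_getattr(2) and fill in SchedAttr.Size for the caller. A short sketch of reading and adjusting the calling process's scheduling attributes; the switch to SCHED_BATCH and the nice value are illustrative choices, not part of the patch:

package main

import (
	"fmt"
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// pid 0 means the calling thread.
	attr, err := unix.SchedGetAttr(0, 0)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("current policy: %d\n", attr.Policy)

	// Move to SCHED_BATCH with a nice value of 5; Size is set by the wrapper.
	attr.Policy = unix.SCHED_BATCH
	attr.Nice = 5
	if err := unix.SchedSetAttr(0, attr, 0); err != nil {
		log.Fatal(err)
	}
}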
//go:build arm64 && linux -// +build arm64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc.go index 2b1168d7d..ffc4c2b63 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_gc.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && gc -// +build linux,gc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go index 9843fb489..9ebfdcf44 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && gc && 386 -// +build linux,gc,386 package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go index a6008fccd..5f2b57c4c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_gc_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm && gc && linux -// +build arm,gc,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go index 7740af242..d1a3ad826 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && gccgo && 386 -// +build linux,gccgo,386 package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go index e16a12299..f2f67423e 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_gccgo_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && gccgo && arm -// +build linux,gccgo,arm package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go index f6ab02ec1..3d0e98451 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_loong64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build loong64 && linux -// +build loong64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go index 93fe59d25..70963a95a 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (mips64 || mips64le) -// +build linux -// +build mips64 mips64le package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go index aae7f0ffd..c218ebd28 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build linux && (mips || mipsle) -// +build linux -// +build mips mipsle package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go index 66eff19a3..e6c48500c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && ppc -// +build linux,ppc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go index 806aa2574..7286a9aa8 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux && (ppc64 || ppc64le) -// +build linux -// +build ppc64 ppc64le package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go index 5e6ceee12..6f5a28894 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build riscv64 && linux -// +build riscv64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go index 2f89e8f5d..66f31210d 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build s390x && linux -// +build s390x,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go index 7ca064ae7..11d1f1698 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build sparc64 && linux -// +build sparc64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd.go b/vendor/golang.org/x/sys/unix/syscall_netbsd.go index ddd1ac853..88162099a 100644 --- a/vendor/golang.org/x/sys/unix/syscall_netbsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_netbsd.go @@ -356,8 +356,6 @@ func Statvfs(path string, buf *Statvfs_t) (err error) { //sys write(fd int, p []byte) (n int, err error) //sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) //sys munmap(addr uintptr, length uintptr) (err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ -//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE //sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) const ( @@ -371,262 +369,3 @@ const ( func mremap(oldaddr uintptr, oldlength uintptr, newlength uintptr, flags int, newaddr uintptr) (uintptr, error) { return mremapNetBSD(oldaddr, oldlength, newaddr, newlength, flags) } - -/* - * Unimplemented - */ -// ____semctl13 -// __clone -// __fhopen40 -// __fhstat40 -// __fhstatvfs140 -// __fstat30 -// __getcwd -// __getfh30 -// __getlogin -// __lstat30 -// __mount50 -// __msgctl13 -// __msync13 -// __ntp_gettime30 -// __posix_chown -// __posix_fchown -// __posix_lchown -// __posix_rename -// __setlogin -// __shmctl13 -// __sigaction_sigtramp -// __sigaltstack14 -// __sigpending14 -// __sigprocmask14 -// __sigsuspend14 -// __sigtimedwait -// __stat30 -// __syscall -// __vfork14 -// _ksem_close -// _ksem_destroy -// _ksem_getvalue -// _ksem_init -// _ksem_open -// _ksem_post -// _ksem_trywait -// _ksem_unlink -// _ksem_wait -// _lwp_continue -// _lwp_create -// _lwp_ctl -// _lwp_detach -// _lwp_exit -// _lwp_getname -// _lwp_getprivate -// _lwp_kill -// _lwp_park -// _lwp_self -// _lwp_setname -// _lwp_setprivate -// _lwp_suspend -// _lwp_unpark -// _lwp_unpark_all -// _lwp_wait -// _lwp_wakeup -// _pset_bind -// _sched_getaffinity -// _sched_getparam -// _sched_setaffinity -// _sched_setparam -// acct -// aio_cancel -// aio_error -// aio_fsync -// aio_read -// aio_return -// aio_suspend -// aio_write -// break -// clock_getres -// clock_gettime -// clock_settime -// compat_09_ogetdomainname -// compat_09_osetdomainname -// compat_09_ouname -// compat_10_omsgsys -// compat_10_osemsys -// compat_10_oshmsys -// compat_12_fstat12 -// compat_12_getdirentries -// compat_12_lstat12 -// compat_12_msync -// compat_12_oreboot -// compat_12_oswapon -// compat_12_stat12 -// compat_13_sigaction13 -// compat_13_sigaltstack13 -// compat_13_sigpending13 -// compat_13_sigprocmask13 -// compat_13_sigreturn13 -// compat_13_sigsuspend13 -// compat_14___semctl -// compat_14_msgctl -// compat_14_shmctl -// compat_16___sigaction14 -// compat_16___sigreturn14 -// compat_20_fhstatfs -// compat_20_fstatfs -// compat_20_getfsstat -// compat_20_statfs -// compat_30___fhstat30 -// compat_30___fstat13 -// compat_30___lstat13 -// compat_30___stat13 -// compat_30_fhopen -// compat_30_fhstat -// compat_30_fhstatvfs1 -// compat_30_getdents -// compat_30_getfh -// compat_30_ntp_gettime -// compat_30_socket -// compat_40_mount -// compat_43_fstat43 -// compat_43_lstat43 -// compat_43_oaccept -// compat_43_ocreat -// compat_43_oftruncate -// compat_43_ogetdirentries -// compat_43_ogetdtablesize -// compat_43_ogethostid -// compat_43_ogethostname -// compat_43_ogetkerninfo -// compat_43_ogetpagesize -// compat_43_ogetpeername -// compat_43_ogetrlimit -// 
compat_43_ogetsockname -// compat_43_okillpg -// compat_43_olseek -// compat_43_ommap -// compat_43_oquota -// compat_43_orecv -// compat_43_orecvfrom -// compat_43_orecvmsg -// compat_43_osend -// compat_43_osendmsg -// compat_43_osethostid -// compat_43_osethostname -// compat_43_osigblock -// compat_43_osigsetmask -// compat_43_osigstack -// compat_43_osigvec -// compat_43_otruncate -// compat_43_owait -// compat_43_stat43 -// execve -// extattr_delete_fd -// extattr_delete_file -// extattr_delete_link -// extattr_get_fd -// extattr_get_file -// extattr_get_link -// extattr_list_fd -// extattr_list_file -// extattr_list_link -// extattr_set_fd -// extattr_set_file -// extattr_set_link -// extattrctl -// fchroot -// fdatasync -// fgetxattr -// fktrace -// flistxattr -// fork -// fremovexattr -// fsetxattr -// fstatvfs1 -// fsync_range -// getcontext -// getitimer -// getvfsstat -// getxattr -// ktrace -// lchflags -// lchmod -// lfs_bmapv -// lfs_markv -// lfs_segclean -// lfs_segwait -// lgetxattr -// lio_listio -// listxattr -// llistxattr -// lremovexattr -// lseek -// lsetxattr -// lutimes -// madvise -// mincore -// minherit -// modctl -// mq_close -// mq_getattr -// mq_notify -// mq_open -// mq_receive -// mq_send -// mq_setattr -// mq_timedreceive -// mq_timedsend -// mq_unlink -// msgget -// msgrcv -// msgsnd -// nfssvc -// ntp_adjtime -// pmc_control -// pmc_get_info -// pollts -// preadv -// profil -// pselect -// pset_assign -// pset_create -// pset_destroy -// ptrace -// pwritev -// quotactl -// rasctl -// readv -// reboot -// removexattr -// sa_enable -// sa_preempt -// sa_register -// sa_setconcurrency -// sa_stacks -// sa_yield -// sbrk -// sched_yield -// semconfig -// semget -// semop -// setcontext -// setitimer -// setxattr -// shmat -// shmdt -// shmget -// sstk -// statvfs1 -// swapctl -// sysarch -// syscall -// timer_create -// timer_delete -// timer_getoverrun -// timer_gettime -// timer_settime -// undelete -// utrace -// uuidgen -// vadvise -// vfork -// writev diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go index 5199d282f..7a5eb5743 100644 --- a/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && netbsd -// +build 386,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go index 70a9c52e9..62d8957ae 100644 --- a/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && netbsd -// +build amd64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go index 3eb5942f9..ce6a06885 100644 --- a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm && netbsd -// +build arm,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go index fc6ccfd81..d46d689d1 100644 --- a/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_netbsd_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build arm64 && netbsd -// +build arm64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd.go b/vendor/golang.org/x/sys/unix/syscall_openbsd.go index c5f166a11..b25343c71 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd.go @@ -137,18 +137,13 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e } func Getfsstat(buf []Statfs_t, flags int) (n int, err error) { - var _p0 unsafe.Pointer + var bufptr *Statfs_t var bufsize uintptr if len(buf) > 0 { - _p0 = unsafe.Pointer(&buf[0]) + bufptr = &buf[0] bufsize = unsafe.Sizeof(Statfs_t{}) * uintptr(len(buf)) } - r0, _, e1 := Syscall(SYS_GETFSSTAT, uintptr(_p0), bufsize, uintptr(flags)) - n = int(r0) - if e1 != 0 { - err = e1 - } - return + return getfsstat(bufptr, bufsize, flags) } //sysnb getresuid(ruid *_C_int, euid *_C_int, suid *_C_int) @@ -171,6 +166,20 @@ func Getresgid() (rgid, egid, sgid int) { //sys sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) = SYS___SYSCTL +//sys fcntl(fd int, cmd int, arg int) (n int, err error) +//sys fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) = SYS_FCNTL + +// FcntlInt performs a fcntl syscall on fd with the provided command and argument. +func FcntlInt(fd uintptr, cmd, arg int) (int, error) { + return fcntl(int(fd), cmd, arg) +} + +// FcntlFlock performs a fcntl syscall for the F_GETLK, F_SETLK or F_SETLKW command. +func FcntlFlock(fd uintptr, cmd int, lk *Flock_t) error { + _, err := fcntlPtr(int(fd), cmd, unsafe.Pointer(lk)) + return err +} + //sys ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) func Ppoll(fds []PollFd, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { @@ -326,78 +335,7 @@ func Uname(uname *Utsname) error { //sys write(fd int, p []byte) (n int, err error) //sys mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) //sys munmap(addr uintptr, length uintptr) (err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ -//sys writelen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_WRITE +//sys getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) //sys utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) - -/* - * Unimplemented - */ -// __getcwd -// __semctl -// __syscall -// __sysctl -// adjfreq -// break -// clock_getres -// clock_gettime -// clock_settime -// closefrom -// execve -// fhopen -// fhstat -// fhstatfs -// fork -// futimens -// getfh -// getgid -// getitimer -// getlogin -// getthrid -// ktrace -// lfs_bmapv -// lfs_markv -// lfs_segclean -// lfs_segwait -// mincore -// minherit -// mount -// mquery -// msgctl -// msgget -// msgrcv -// msgsnd -// nfssvc -// nnpfspioctl -// preadv -// profil -// pwritev -// quotactl -// readv -// reboot -// renameat -// rfork -// sched_yield -// semget -// semop -// setgroups -// setitimer -// setsockopt -// shmat -// shmctl -// shmdt -// shmget -// sigaction -// sigaltstack -// sigpending -// sigprocmask -// sigreturn -// sigsuspend -// sysarch -// syscall -// threxit -// thrsigdivert -// thrsleep -// thrwakeup -// vfork -// writev +//sys pledge(promises *byte, execpromises *byte) (err error) +//sys unveil(path *byte, flags *byte) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go index 6baabcdcb..9ddc89f4f 100644 --- 
a/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_386.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build 386 && openbsd -// +build 386,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go index bab25360e..70a3c96ee 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && openbsd -// +build amd64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go index 8eed3c4d4..265caa87f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm && openbsd -// +build arm,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go index 483dde99d..ac4fda171 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_arm64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build arm64 && openbsd -// +build arm64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go index 04aa43f41..0a451e6dd 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_libc.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build openbsd -// +build openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go index c2796139c..30a308cbb 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_ppc64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build ppc64 && openbsd -// +build ppc64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go index 23199a7ff..ea954330f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_riscv64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build riscv64 && openbsd -// +build riscv64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_solaris.go b/vendor/golang.org/x/sys/unix/syscall_solaris.go index 72d23575f..21974af06 100644 --- a/vendor/golang.org/x/sys/unix/syscall_solaris.go +++ b/vendor/golang.org/x/sys/unix/syscall_solaris.go @@ -128,7 +128,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) { if n > 0 { sl += _Socklen(n) + 1 } - if sa.raw.Path[0] == '@' { + if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) { + // Check sl > 3 so we don't change unnamed socket behavior. sa.raw.Path[0] = 0 // Don't count trailing NUL for abstract address. 
sl-- @@ -157,7 +158,7 @@ func GetsockoptString(fd, level, opt int) (string, error) { if err != nil { return "", err } - return string(buf[:vallen-1]), nil + return ByteSliceToString(buf[:vallen]), nil } const ImplementsGetwd = true @@ -698,24 +699,6 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e //sys setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) = libsocket.setsockopt //sys recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) = libsocket.recvfrom -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procread)), 3, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf), 0, 0, 0) - n = int(r0) - if e1 != 0 { - err = e1 - } - return -} - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwrite)), 3, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf), 0, 0, 0) - n = int(r0) - if e1 != 0 { - err = e1 - } - return -} - // Event Ports type fileObjCookie struct { diff --git a/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go b/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go index 0bd25ef81..e02d8ceae 100644 --- a/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_solaris_amd64.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build amd64 && solaris -// +build amd64,solaris package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go index 8bb30e7ce..77081de8c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_unix.go +++ b/vendor/golang.org/x/sys/unix/syscall_unix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris package unix @@ -549,6 +548,9 @@ func SetNonblock(fd int, nonblocking bool) (err error) { if err != nil { return err } + if (flag&O_NONBLOCK != 0) == nonblocking { + return nil + } if nonblocking { flag |= O_NONBLOCK } else { diff --git a/vendor/golang.org/x/sys/unix/syscall_unix_gc.go b/vendor/golang.org/x/sys/unix/syscall_unix_gc.go index b6919ca58..05c95bccf 100644 --- a/vendor/golang.org/x/sys/unix/syscall_unix_gc.go +++ b/vendor/golang.org/x/sys/unix/syscall_unix_gc.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin || dragonfly || freebsd || (linux && !ppc64 && !ppc64le) || netbsd || openbsd || solaris) && gc -// +build darwin dragonfly freebsd linux,!ppc64,!ppc64le netbsd openbsd solaris -// +build gc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go b/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go index f6f707acf..23f39b7af 100644 --- a/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go +++ b/vendor/golang.org/x/sys/unix/syscall_unix_gc_ppc64x.go @@ -3,9 +3,6 @@ // license that can be found in the LICENSE file. 
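Returning to the FcntlInt/FcntlFlock helpers added to syscall_openbsd.go earlier in this diff (the other platforms already provide the same pair): a hedged sketch of taking and releasing a whole-file write lock; the lock file path is illustrative, and the Flock_t field names shown are the ones generated for Linux and the BSDs:

package main

import (
	"log"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.OpenFile("/tmp/example.lock", os.O_CREATE|os.O_RDWR, 0o600)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Exclusive write lock over the whole file (Len == 0, from SEEK_SET).
	lk := unix.Flock_t{
		Type:   unix.F_WRLCK,
		Whence: 0, // relative to the start of the file
		Start:  0,
		Len:    0,
	}
	if err := unix.FcntlFlock(f.Fd(), unix.F_SETLKW, &lk); err != nil {
		log.Fatal(err)
	}

	// ... critical section ...

	lk.Type = unix.F_UNLCK
	if err := unix.FcntlFlock(f.Fd(), unix.F_SETLK, &lk); err != nil {
		log.Fatal(err)
	}
}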
//go:build linux && (ppc64le || ppc64) && gc -// +build linux -// +build ppc64le ppc64 -// +build gc package unix diff --git a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go index 44e72edb4..b473038c6 100644 --- a/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/syscall_zos_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x package unix @@ -192,7 +191,6 @@ func (cmsg *Cmsghdr) SetLen(length int) { //sys fcntl(fd int, cmd int, arg int) (val int, err error) //sys read(fd int, p []byte) (n int, err error) -//sys readlen(fd int, buf *byte, nbuf int) (n int, err error) = SYS_READ //sys write(fd int, p []byte) (n int, err error) //sys accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) = SYS___ACCEPT_A @@ -1106,7 +1104,7 @@ func GetsockoptString(fd, level, opt int) (string, error) { return "", err } - return string(buf[:vallen-1]), nil + return ByteSliceToString(buf[:vallen]), nil } func Recvmsg(fd int, p, oob []byte, flags int) (n, oobn int, recvflags int, from Sockaddr, err error) { diff --git a/vendor/golang.org/x/sys/unix/sysvshm_linux.go b/vendor/golang.org/x/sys/unix/sysvshm_linux.go index 2c3a4437f..4fcd38de2 100644 --- a/vendor/golang.org/x/sys/unix/sysvshm_linux.go +++ b/vendor/golang.org/x/sys/unix/sysvshm_linux.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build linux -// +build linux package unix diff --git a/vendor/golang.org/x/sys/unix/sysvshm_unix.go b/vendor/golang.org/x/sys/unix/sysvshm_unix.go index 5bb41d17b..79a84f18b 100644 --- a/vendor/golang.org/x/sys/unix/sysvshm_unix.go +++ b/vendor/golang.org/x/sys/unix/sysvshm_unix.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build (darwin && !ios) || linux -// +build darwin,!ios linux package unix diff --git a/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go b/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go index 71bddefdb..9eb0db664 100644 --- a/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go +++ b/vendor/golang.org/x/sys/unix/sysvshm_unix_other.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build darwin && !ios -// +build darwin,!ios package unix diff --git a/vendor/golang.org/x/sys/unix/timestruct.go b/vendor/golang.org/x/sys/unix/timestruct.go index 616b1b284..7997b1902 100644 --- a/vendor/golang.org/x/sys/unix/timestruct.go +++ b/vendor/golang.org/x/sys/unix/timestruct.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris || zos -// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris zos package unix diff --git a/vendor/golang.org/x/sys/unix/unveil_openbsd.go b/vendor/golang.org/x/sys/unix/unveil_openbsd.go index 168d5ae77..cb7e598ce 100644 --- a/vendor/golang.org/x/sys/unix/unveil_openbsd.go +++ b/vendor/golang.org/x/sys/unix/unveil_openbsd.go @@ -4,39 +4,48 @@ package unix -import ( - "syscall" - "unsafe" -) +import "fmt" // Unveil implements the unveil syscall. // For more information see unveil(2). // Note that the special case of blocking further // unveil calls is handled by UnveilBlock. 
func Unveil(path string, flags string) error { - pathPtr, err := syscall.BytePtrFromString(path) + if err := supportsUnveil(); err != nil { + return err + } + pathPtr, err := BytePtrFromString(path) if err != nil { return err } - flagsPtr, err := syscall.BytePtrFromString(flags) + flagsPtr, err := BytePtrFromString(flags) if err != nil { return err } - _, _, e := syscall.Syscall(SYS_UNVEIL, uintptr(unsafe.Pointer(pathPtr)), uintptr(unsafe.Pointer(flagsPtr)), 0) - if e != 0 { - return e - } - return nil + return unveil(pathPtr, flagsPtr) } // UnveilBlock blocks future unveil calls. // For more information see unveil(2). func UnveilBlock() error { - // Both pointers must be nil. - var pathUnsafe, flagsUnsafe unsafe.Pointer - _, _, e := syscall.Syscall(SYS_UNVEIL, uintptr(pathUnsafe), uintptr(flagsUnsafe), 0) - if e != 0 { - return e + if err := supportsUnveil(); err != nil { + return err } + return unveil(nil, nil) +} + +// supportsUnveil checks for availability of the unveil(2) system call based +// on the running OpenBSD version. +func supportsUnveil() error { + maj, min, err := majmin() + if err != nil { + return err + } + + // unveil is not available before 6.4 + if maj < 6 || (maj == 6 && min <= 3) { + return fmt.Errorf("cannot call Unveil on OpenBSD %d.%d", maj, min) + } + return nil } diff --git a/vendor/golang.org/x/sys/unix/xattr_bsd.go b/vendor/golang.org/x/sys/unix/xattr_bsd.go index f5f8e9f36..e16879396 100644 --- a/vendor/golang.org/x/sys/unix/xattr_bsd.go +++ b/vendor/golang.org/x/sys/unix/xattr_bsd.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build freebsd || netbsd -// +build freebsd netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go index ca9799b79..2fb219d78 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go +++ b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && aix -// +build ppc,aix // Created by cgo -godefs - DO NOT EDIT // cgo -godefs -- -maix32 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go index 200c8c26f..b0e6f5c85 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_aix_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && aix -// +build ppc64,aix // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -maix64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go index 143007627..e40fa8524 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && darwin -// +build amd64,darwin // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go index ab044a742..bb02aa6c0 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
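[Reviewer note, not part of the vendored patch] The rewritten unveil_openbsd.go above now routes through the generated unveil wrapper and, via supportsUnveil, reports a descriptive error on OpenBSD releases before 6.4 instead of surfacing a raw errno. A short OpenBSD-only sketch of the public API, assuming the updated golang.org/x/sys/unix:

//go:build openbsd

package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// Allow read-only access to /etc, then forbid any further unveil calls.
	// On OpenBSD older than 6.4 both calls now fail with a clear
	// "cannot call Unveil on OpenBSD x.y" error, per the hunk above.
	if err := unix.Unveil("/etc", "r"); err != nil {
		log.Fatal(err)
	}
	if err := unix.UnveilBlock(); err != nil {
		log.Fatal(err)
	}
	log.Println("filesystem view restricted to /etc")
}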
//go:build arm64 && darwin -// +build arm64,darwin // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go index 17bba0e44..c0e0f8694 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_dragonfly_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && dragonfly -// +build amd64,dragonfly // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go index f8c2c5138..6c6923906 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && freebsd -// +build 386,freebsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m32 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go index 96310c3be..dd9163f8e 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && freebsd -// +build amd64,freebsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go index 777b69def..493a2a793 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && freebsd -// +build arm,freebsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go index c557ac2db..8b437b307 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && freebsd -// +build arm64,freebsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go index 341b4d962..67c02dd57 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && freebsd -// +build riscv64,freebsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go index 3784f402e..36bf8399f 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go @@ -1,7 +1,6 @@ // Code generated by mkmerge; DO NOT EDIT. 
//go:build linux -// +build linux package unix @@ -481,10 +480,13 @@ const ( BPF_FROM_BE = 0x8 BPF_FROM_LE = 0x0 BPF_FS_MAGIC = 0xcafe4a11 + BPF_F_AFTER = 0x10 BPF_F_ALLOW_MULTI = 0x2 BPF_F_ALLOW_OVERRIDE = 0x1 BPF_F_ANY_ALIGNMENT = 0x2 - BPF_F_KPROBE_MULTI_RETURN = 0x1 + BPF_F_BEFORE = 0x8 + BPF_F_ID = 0x20 + BPF_F_NETFILTER_IP_DEFRAG = 0x1 BPF_F_QUERY_EFFECTIVE = 0x1 BPF_F_REPLACE = 0x4 BPF_F_SLEEPABLE = 0x10 @@ -521,6 +523,7 @@ const ( BPF_MAJOR_VERSION = 0x1 BPF_MAXINSNS = 0x1000 BPF_MEM = 0x60 + BPF_MEMSX = 0x80 BPF_MEMWORDS = 0x10 BPF_MINOR_VERSION = 0x1 BPF_MISC = 0x7 @@ -776,6 +779,8 @@ const ( DEVLINK_GENL_MCGRP_CONFIG_NAME = "config" DEVLINK_GENL_NAME = "devlink" DEVLINK_GENL_VERSION = 0x1 + DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO = 0x4 + DEVLINK_PORT_FN_CAP_IPSEC_PACKET = 0x8 DEVLINK_PORT_FN_CAP_MIGRATABLE = 0x2 DEVLINK_PORT_FN_CAP_ROCE = 0x1 DEVLINK_SB_THRESHOLD_TO_ALPHA_MAX = 0x14 @@ -1698,6 +1703,7 @@ const ( KEXEC_ON_CRASH = 0x1 KEXEC_PRESERVE_CONTEXT = 0x2 KEXEC_SEGMENT_MAX = 0x10 + KEXEC_UPDATE_ELFCOREHDR = 0x4 KEYCTL_ASSUME_AUTHORITY = 0x10 KEYCTL_CAPABILITIES = 0x1f KEYCTL_CAPS0_BIG_KEY = 0x10 @@ -1779,6 +1785,8 @@ const ( LANDLOCK_ACCESS_FS_REMOVE_FILE = 0x20 LANDLOCK_ACCESS_FS_TRUNCATE = 0x4000 LANDLOCK_ACCESS_FS_WRITE_FILE = 0x2 + LANDLOCK_ACCESS_NET_BIND_TCP = 0x1 + LANDLOCK_ACCESS_NET_CONNECT_TCP = 0x2 LANDLOCK_CREATE_RULESET_VERSION = 0x1 LINUX_REBOOT_CMD_CAD_OFF = 0x0 LINUX_REBOOT_CMD_CAD_ON = 0x89abcdef @@ -1795,6 +1803,7 @@ const ( LOCK_SH = 0x1 LOCK_UN = 0x8 LOOP_CLR_FD = 0x4c01 + LOOP_CONFIGURE = 0x4c0a LOOP_CTL_ADD = 0x4c80 LOOP_CTL_GET_FREE = 0x4c82 LOOP_CTL_REMOVE = 0x4c81 @@ -2120,6 +2129,60 @@ const ( NFNL_SUBSYS_QUEUE = 0x3 NFNL_SUBSYS_ULOG = 0x4 NFS_SUPER_MAGIC = 0x6969 + NFT_CHAIN_FLAGS = 0x7 + NFT_CHAIN_MAXNAMELEN = 0x100 + NFT_CT_MAX = 0x17 + NFT_DATA_RESERVED_MASK = 0xffffff00 + NFT_DATA_VALUE_MAXLEN = 0x40 + NFT_EXTHDR_OP_MAX = 0x4 + NFT_FIB_RESULT_MAX = 0x3 + NFT_INNER_MASK = 0xf + NFT_LOGLEVEL_MAX = 0x8 + NFT_NAME_MAXLEN = 0x100 + NFT_NG_MAX = 0x1 + NFT_OBJECT_CONNLIMIT = 0x5 + NFT_OBJECT_COUNTER = 0x1 + NFT_OBJECT_CT_EXPECT = 0x9 + NFT_OBJECT_CT_HELPER = 0x3 + NFT_OBJECT_CT_TIMEOUT = 0x7 + NFT_OBJECT_LIMIT = 0x4 + NFT_OBJECT_MAX = 0xa + NFT_OBJECT_QUOTA = 0x2 + NFT_OBJECT_SECMARK = 0x8 + NFT_OBJECT_SYNPROXY = 0xa + NFT_OBJECT_TUNNEL = 0x6 + NFT_OBJECT_UNSPEC = 0x0 + NFT_OBJ_MAXNAMELEN = 0x100 + NFT_OSF_MAXGENRELEN = 0x10 + NFT_QUEUE_FLAG_BYPASS = 0x1 + NFT_QUEUE_FLAG_CPU_FANOUT = 0x2 + NFT_QUEUE_FLAG_MASK = 0x3 + NFT_REG32_COUNT = 0x10 + NFT_REG32_SIZE = 0x4 + NFT_REG_MAX = 0x4 + NFT_REG_SIZE = 0x10 + NFT_REJECT_ICMPX_MAX = 0x3 + NFT_RT_MAX = 0x4 + NFT_SECMARK_CTX_MAXLEN = 0x100 + NFT_SET_MAXNAMELEN = 0x100 + NFT_SOCKET_MAX = 0x3 + NFT_TABLE_F_MASK = 0x3 + NFT_TABLE_MAXNAMELEN = 0x100 + NFT_TRACETYPE_MAX = 0x3 + NFT_TUNNEL_F_MASK = 0x7 + NFT_TUNNEL_MAX = 0x1 + NFT_TUNNEL_MODE_MAX = 0x2 + NFT_USERDATA_MAXLEN = 0x100 + NFT_XFRM_KEY_MAX = 0x6 + NF_NAT_RANGE_MAP_IPS = 0x1 + NF_NAT_RANGE_MASK = 0x7f + NF_NAT_RANGE_NETMAP = 0x40 + NF_NAT_RANGE_PERSISTENT = 0x8 + NF_NAT_RANGE_PROTO_OFFSET = 0x20 + NF_NAT_RANGE_PROTO_RANDOM = 0x4 + NF_NAT_RANGE_PROTO_RANDOM_ALL = 0x14 + NF_NAT_RANGE_PROTO_RANDOM_FULLY = 0x10 + NF_NAT_RANGE_PROTO_SPECIFIED = 0x2 NILFS_SUPER_MAGIC = 0x3434 NL0 = 0x0 NL1 = 0x100 @@ -2275,6 +2338,7 @@ const ( PERF_MEM_LVLNUM_PMEM = 0xe PERF_MEM_LVLNUM_RAM = 0xd PERF_MEM_LVLNUM_SHIFT = 0x21 + PERF_MEM_LVLNUM_UNC = 0x8 PERF_MEM_LVL_HIT = 0x2 PERF_MEM_LVL_IO = 0x1000 PERF_MEM_LVL_L1 = 0x8 @@ -2403,6 +2467,7 @@ const ( PR_MCE_KILL_GET = 0x22 
PR_MCE_KILL_LATE = 0x0 PR_MCE_KILL_SET = 0x1 + PR_MDWE_NO_INHERIT = 0x2 PR_MDWE_REFUSE_EXEC_GAIN = 0x1 PR_MPX_DISABLE_MANAGEMENT = 0x2c PR_MPX_ENABLE_MANAGEMENT = 0x2b @@ -2421,6 +2486,15 @@ const ( PR_PAC_GET_ENABLED_KEYS = 0x3d PR_PAC_RESET_KEYS = 0x36 PR_PAC_SET_ENABLED_KEYS = 0x3c + PR_RISCV_V_GET_CONTROL = 0x46 + PR_RISCV_V_SET_CONTROL = 0x45 + PR_RISCV_V_VSTATE_CTRL_CUR_MASK = 0x3 + PR_RISCV_V_VSTATE_CTRL_DEFAULT = 0x0 + PR_RISCV_V_VSTATE_CTRL_INHERIT = 0x10 + PR_RISCV_V_VSTATE_CTRL_MASK = 0x1f + PR_RISCV_V_VSTATE_CTRL_NEXT_MASK = 0xc + PR_RISCV_V_VSTATE_CTRL_OFF = 0x1 + PR_RISCV_V_VSTATE_CTRL_ON = 0x2 PR_SCHED_CORE = 0x3e PR_SCHED_CORE_CREATE = 0x1 PR_SCHED_CORE_GET = 0x0 @@ -2598,8 +2672,9 @@ const ( RTAX_FEATURES = 0xc RTAX_FEATURE_ALLFRAG = 0x8 RTAX_FEATURE_ECN = 0x1 - RTAX_FEATURE_MASK = 0xf + RTAX_FEATURE_MASK = 0x1f RTAX_FEATURE_SACK = 0x2 + RTAX_FEATURE_TCP_USEC_TS = 0x10 RTAX_FEATURE_TIMESTAMP = 0x4 RTAX_HOPLIMIT = 0xa RTAX_INITCWND = 0xb @@ -2821,13 +2896,59 @@ const ( RWF_SUPPORTED = 0x1f RWF_SYNC = 0x4 RWF_WRITE_LIFE_NOT_SET = 0x0 + SCHED_BATCH = 0x3 + SCHED_DEADLINE = 0x6 + SCHED_FIFO = 0x1 + SCHED_FLAG_ALL = 0x7f + SCHED_FLAG_DL_OVERRUN = 0x4 + SCHED_FLAG_KEEP_ALL = 0x18 + SCHED_FLAG_KEEP_PARAMS = 0x10 + SCHED_FLAG_KEEP_POLICY = 0x8 + SCHED_FLAG_RECLAIM = 0x2 + SCHED_FLAG_RESET_ON_FORK = 0x1 + SCHED_FLAG_UTIL_CLAMP = 0x60 + SCHED_FLAG_UTIL_CLAMP_MAX = 0x40 + SCHED_FLAG_UTIL_CLAMP_MIN = 0x20 + SCHED_IDLE = 0x5 + SCHED_NORMAL = 0x0 + SCHED_RESET_ON_FORK = 0x40000000 + SCHED_RR = 0x2 SCM_CREDENTIALS = 0x2 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x1d SC_LOG_FLUSH = 0x100000 + SECCOMP_ADDFD_FLAG_SEND = 0x2 + SECCOMP_ADDFD_FLAG_SETFD = 0x1 + SECCOMP_FILTER_FLAG_LOG = 0x2 + SECCOMP_FILTER_FLAG_NEW_LISTENER = 0x8 + SECCOMP_FILTER_FLAG_SPEC_ALLOW = 0x4 + SECCOMP_FILTER_FLAG_TSYNC = 0x1 + SECCOMP_FILTER_FLAG_TSYNC_ESRCH = 0x10 + SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV = 0x20 + SECCOMP_GET_ACTION_AVAIL = 0x2 + SECCOMP_GET_NOTIF_SIZES = 0x3 + SECCOMP_IOCTL_NOTIF_RECV = 0xc0502100 + SECCOMP_IOCTL_NOTIF_SEND = 0xc0182101 + SECCOMP_IOC_MAGIC = '!' 
SECCOMP_MODE_DISABLED = 0x0 SECCOMP_MODE_FILTER = 0x2 SECCOMP_MODE_STRICT = 0x1 + SECCOMP_RET_ACTION = 0x7fff0000 + SECCOMP_RET_ACTION_FULL = 0xffff0000 + SECCOMP_RET_ALLOW = 0x7fff0000 + SECCOMP_RET_DATA = 0xffff + SECCOMP_RET_ERRNO = 0x50000 + SECCOMP_RET_KILL = 0x0 + SECCOMP_RET_KILL_PROCESS = 0x80000000 + SECCOMP_RET_KILL_THREAD = 0x0 + SECCOMP_RET_LOG = 0x7ffc0000 + SECCOMP_RET_TRACE = 0x7ff00000 + SECCOMP_RET_TRAP = 0x30000 + SECCOMP_RET_USER_NOTIF = 0x7fc00000 + SECCOMP_SET_MODE_FILTER = 0x1 + SECCOMP_SET_MODE_STRICT = 0x0 + SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP = 0x1 + SECCOMP_USER_NOTIF_FLAG_CONTINUE = 0x1 SECRETMEM_MAGIC = 0x5345434d SECURITYFS_MAGIC = 0x73636673 SEEK_CUR = 0x1 @@ -2987,6 +3108,7 @@ const ( SOL_TIPC = 0x10f SOL_TLS = 0x11a SOL_UDP = 0x11 + SOL_VSOCK = 0x11f SOL_X25 = 0x106 SOL_XDP = 0x11b SOMAXCONN = 0x1000 @@ -3435,6 +3557,7 @@ const ( XDP_PACKET_HEADROOM = 0x100 XDP_PGOFF_RX_RING = 0x0 XDP_PGOFF_TX_RING = 0x80000000 + XDP_PKT_CONTD = 0x1 XDP_RING_NEED_WAKEUP = 0x1 XDP_RX_RING = 0x2 XDP_SHARED_UMEM = 0x1 @@ -3447,6 +3570,7 @@ const ( XDP_UMEM_REG = 0x4 XDP_UMEM_UNALIGNED_CHUNK_FLAG = 0x1 XDP_USE_NEED_WAKEUP = 0x8 + XDP_USE_SG = 0x10 XDP_ZEROCOPY = 0x4 XENFS_SUPER_MAGIC = 0xabba1974 XFS_SUPER_MAGIC = 0x58465342 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go index cfb143001..42ff8c3c1 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && linux -// +build 386,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/386/include -m32 _const.go @@ -282,6 +281,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -326,10 +328,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go index df64f2d59..dca436004 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux -// +build amd64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. 
// cgo -godefs -- -Wall -Werror -static -I/tmp/amd64/include -m64 _const.go @@ -283,6 +282,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -327,10 +329,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go index 3025cd5b2..5cca668ac 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && linux -// +build arm,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/arm/include _const.go @@ -289,6 +288,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -333,10 +335,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go index 09e1ffbef..d8cae6d15 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux -// +build arm64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/arm64/include -fsigned-char _const.go @@ -279,6 +278,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -323,10 +325,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go index a45723540..28e39afdc 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_loong64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build loong64 && linux -// +build loong64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. 
// cgo -godefs -- -Wall -Werror -static -I/tmp/loong64/include _const.go @@ -118,6 +117,9 @@ const ( IUCLC = 0x200 IXOFF = 0x1000 IXON = 0x400 + LASX_CTX_MAGIC = 0x41535801 + LBT_CTX_MAGIC = 0x42540001 + LSX_CTX_MAGIC = 0x53580001 MAP_ANON = 0x20 MAP_ANONYMOUS = 0x20 MAP_DENYWRITE = 0x800 @@ -273,6 +275,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -317,10 +322,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go index fee7dfb81..cd66e92cb 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux -// +build mips,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/mips/include _const.go @@ -282,6 +281,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x80 SIOCATMARK = 0x40047307 @@ -326,10 +328,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0x100 SO_PASSCRED = 0x11 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x12 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1e SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x1028 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go index a5b2373ae..c1595eba7 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux -// +build mips64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/mips64/include _const.go @@ -282,6 +281,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x80 SIOCATMARK = 0x40047307 @@ -326,10 +328,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0x100 SO_PASSCRED = 0x11 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x12 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1e SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x1028 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go index 5dde82c98..ee9456b0d 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64le && linux -// +build mips64le,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. 
// cgo -godefs -- -Wall -Werror -static -I/tmp/mips64le/include _const.go @@ -282,6 +281,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x80 SIOCATMARK = 0x40047307 @@ -326,10 +328,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0x100 SO_PASSCRED = 0x11 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x12 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1e SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x1028 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go index 2e80ea6b3..8cfca81e1 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mipsle && linux -// +build mipsle,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/mipsle/include _const.go @@ -282,6 +281,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x80 SIOCATMARK = 0x40047307 @@ -326,10 +328,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0x100 SO_PASSCRED = 0x11 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x12 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1e SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x1028 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go index a65dcd7cb..60b0deb3a 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux -// +build ppc,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/ppc/include _const.go @@ -337,6 +336,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -381,10 +383,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x14 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x15 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go index cbd34e3d8..f90aa7281 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux -// +build ppc64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. 
// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64/include _const.go @@ -341,6 +340,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -385,10 +387,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x14 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x15 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go index e4afa7a31..ba9e01503 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux -// +build ppc64le,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64le/include _const.go @@ -341,6 +340,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -385,10 +387,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x14 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x15 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go index 44f45a039..07cdfd6e9 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && linux -// +build riscv64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/riscv64/include _const.go @@ -228,6 +227,9 @@ const ( PPPIOCUNBRIDGECHAN = 0x7434 PPPIOCXFERUNIT = 0x744e PR_SET_PTRACER_ANY = 0xffffffffffffffff + PTRACE_GETFDPIC = 0x21 + PTRACE_GETFDPIC_EXEC = 0x0 + PTRACE_GETFDPIC_INTERP = 0x1 RLIMIT_AS = 0x9 RLIMIT_MEMLOCK = 0x8 RLIMIT_NOFILE = 0x7 @@ -270,6 +272,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -314,10 +319,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go index 74733e260..2f1dd214a 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux -// +build s390x,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. 
// cgo -godefs -- -Wall -Werror -static -I/tmp/s390x/include -fsigned-char _const.go @@ -345,6 +344,9 @@ const ( SCM_TIMESTAMPNS = 0x23 SCM_TXTIME = 0x3d SCM_WIFI_STATUS = 0x29 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x40182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x40082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x40082104 SFD_CLOEXEC = 0x80000 SFD_NONBLOCK = 0x800 SIOCATMARK = 0x8905 @@ -389,10 +391,12 @@ const ( SO_NOFCS = 0x2b SO_OOBINLINE = 0xa SO_PASSCRED = 0x10 + SO_PASSPIDFD = 0x4c SO_PASSSEC = 0x22 SO_PEEK_OFF = 0x2a SO_PEERCRED = 0x11 SO_PEERGROUPS = 0x3b + SO_PEERPIDFD = 0x4d SO_PEERSEC = 0x1f SO_PREFER_BUSY_POLL = 0x45 SO_PROTOCOL = 0x26 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go index f5f3934b1..f40519d90 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux -// +build sparc64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -Wall -Werror -static -I/tmp/sparc64/include _const.go @@ -336,6 +335,9 @@ const ( SCM_TIMESTAMPNS = 0x21 SCM_TXTIME = 0x3f SCM_WIFI_STATUS = 0x25 + SECCOMP_IOCTL_NOTIF_ADDFD = 0x80182103 + SECCOMP_IOCTL_NOTIF_ID_VALID = 0x80082102 + SECCOMP_IOCTL_NOTIF_SET_FLAGS = 0x80082104 SFD_CLOEXEC = 0x400000 SFD_NONBLOCK = 0x4000 SF_FP = 0x38 @@ -428,10 +430,12 @@ const ( SO_NOFCS = 0x27 SO_OOBINLINE = 0x100 SO_PASSCRED = 0x2 + SO_PASSPIDFD = 0x55 SO_PASSSEC = 0x1f SO_PEEK_OFF = 0x26 SO_PEERCRED = 0x40 SO_PEERGROUPS = 0x3d + SO_PEERPIDFD = 0x56 SO_PEERSEC = 0x1e SO_PREFER_BUSY_POLL = 0x48 SO_PROTOCOL = 0x1028 diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go index 72f7420d2..130085df4 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && netbsd -// +build 386,netbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m32 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go index 8d4eb0c08..84769a1a3 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && netbsd -// +build amd64,netbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go index 9eef9749f..602ded003 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && netbsd -// +build arm,netbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -marm _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go index 3b62ba192..efc0406ee 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_netbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
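[Reviewer note, not part of the vendored patch] Each Linux zerrors file above gains, among others, SO_PASSPIDFD and SO_PEERPIDFD plus the SECCOMP_IOCTL_NOTIF_* ioctl numbers (encoded per architecture). The socket options merely become addressable from Go; using them still needs kernel support (the pidfd socket options arrived around Linux 6.5). A hedged Linux-only sketch querying a peer pidfd over a socketpair:

//go:build linux

package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// Connected pair of Unix datagram sockets.
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_DGRAM, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer unix.Close(fds[0])
	defer unix.Close(fds[1])

	// SO_PASSPIDFD asks the kernel to attach the sender's pidfd as a
	// control message; SO_PEERPIDFD returns a pidfd for the connected peer.
	// On kernels without support these calls fail (typically ENOPROTOOPT).
	if err := unix.SetsockoptInt(fds[0], unix.SOL_SOCKET, unix.SO_PASSPIDFD, 1); err != nil {
		log.Printf("SO_PASSPIDFD unsupported: %v", err)
	}
	pidfd, err := unix.GetsockoptInt(fds[0], unix.SOL_SOCKET, unix.SO_PEERPIDFD)
	if err != nil {
		log.Printf("SO_PEERPIDFD unsupported: %v", err)
		return
	}
	defer unix.Close(pidfd)
	log.Printf("peer pidfd: %d", pidfd)
}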
//go:build arm64 && netbsd -// +build arm64,netbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go index af20e474b..5a6500f83 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && openbsd -// +build 386,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m32 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go index 6015fcb2b..a5aeeb979 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && openbsd -// +build amd64,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go index 8d44955e4..0e9748a72 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && openbsd -// +build arm,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go index ae16fe754..4f4449abc 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && openbsd -// +build arm64,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go index 03d90fe35..76a363f0f 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && openbsd -// +build mips64,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go index 8e2c51b1e..43ca0cdfd 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && openbsd -// +build ppc64,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go index 13d403031..b1b8bb200 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_openbsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build riscv64 && openbsd -// +build riscv64,openbsd // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go index 1afee6a08..d2ddd3176 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_solaris_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && solaris -// +build amd64,solaris // Code generated by cmd/cgo -godefs; DO NOT EDIT. // cgo -godefs -- -m64 _const.go diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go index fc7d0506f..4dfd2e051 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x // Hand edited based on zerrors_linux_s390x.go // TODO: auto-generate. diff --git a/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go b/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go index 97f20ca28..586317c78 100644 --- a/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go +++ b/vendor/golang.org/x/sys/unix/zptrace_armnn_linux.go @@ -1,8 +1,6 @@ // Code generated by linux/mkall.go generatePtracePair("arm", "arm64"). DO NOT EDIT. //go:build linux && (arm || arm64) -// +build linux -// +build arm arm64 package unix diff --git a/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go b/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go index 0b5f79430..d7c881be7 100644 --- a/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go +++ b/vendor/golang.org/x/sys/unix/zptrace_mipsnn_linux.go @@ -1,8 +1,6 @@ // Code generated by linux/mkall.go generatePtracePair("mips", "mips64"). DO NOT EDIT. //go:build linux && (mips || mips64) -// +build linux -// +build mips mips64 package unix diff --git a/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go b/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go index 2807f7e64..2d2de5d29 100644 --- a/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go +++ b/vendor/golang.org/x/sys/unix/zptrace_mipsnnle_linux.go @@ -1,8 +1,6 @@ // Code generated by linux/mkall.go generatePtracePair("mipsle", "mips64le"). DO NOT EDIT. //go:build linux && (mipsle || mips64le) -// +build linux -// +build mipsle mips64le package unix diff --git a/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go b/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go index 281ea64e3..5adc79fb5 100644 --- a/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go +++ b/vendor/golang.org/x/sys/unix/zptrace_x86_linux.go @@ -1,8 +1,6 @@ // Code generated by linux/mkall.go generatePtracePair("386", "amd64"). DO NOT EDIT. //go:build linux && (386 || amd64) -// +build linux -// +build 386 amd64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go index 9a257219d..6ea64a3c0 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build aix && ppc -// +build aix,ppc package unix @@ -817,28 +816,6 @@ func write(fd int, p []byte) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, p *byte, np int) (n int, err error) { - r0, er := C.read(C.int(fd), C.uintptr_t(uintptr(unsafe.Pointer(p))), C.size_t(np)) - n = int(r0) - if r0 == -1 && er != nil { - err = er - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, p *byte, np int) (n int, err error) { - r0, er := C.write(C.int(fd), C.uintptr_t(uintptr(unsafe.Pointer(p))), C.size_t(np)) - n = int(r0) - if r0 == -1 && er != nil { - err = er - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Dup2(oldfd int, newfd int) (err error) { r0, er := C.dup2(C.int(oldfd), C.int(newfd)) if r0 == -1 && er != nil { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go index 6de80c20c..99ee4399a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build aix && ppc64 -// +build aix,ppc64 package unix @@ -762,28 +761,6 @@ func write(fd int, p []byte) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, p *byte, np int) (n int, err error) { - r0, e1 := callread(fd, uintptr(unsafe.Pointer(p)), np) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, p *byte, np int) (n int, err error) { - r0, e1 := callwrite(fd, uintptr(unsafe.Pointer(p)), np) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Dup2(oldfd int, newfd int) (err error) { _, e1 := calldup2(oldfd, newfd) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go index c4d50ae50..b68a78362 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build aix && ppc64 && gc -// +build aix,ppc64,gc package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go index 6903d3b09..0a87450bf 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_aix_ppc64_gccgo.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build aix && ppc64 && gccgo -// +build aix,ppc64,gccgo package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go index 4037ccf7a..ccb02f240 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build darwin && amd64 -// +build darwin,amd64 package unix @@ -725,6 +724,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -733,10 +738,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -2410,28 +2411,6 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Fstat(fd int, stat *Stat_t) (err error) { _, _, e1 := syscall_syscall(libc_fstat64_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { @@ -2521,14 +2500,6 @@ func ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) { return } -func ptrace1Ptr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) { - _, _, e1 := syscall_syscall6(libc_ptrace_trampoline_addr, uintptr(request), uintptr(pid), addr, uintptr(data), 0, 0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - var libc_ptrace_trampoline_addr uintptr //go:cgo_import_dynamic libc_ptrace ptrace "/usr/lib/libSystem.B.dylib" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s index 4baaed0bc..8b8bb2840 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s @@ -5,703 +5,586 @@ TEXT libc_fdopendir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fdopendir(SB) - GLOBL ·libc_fdopendir_trampoline_addr(SB), RODATA, $8 DATA ·libc_fdopendir_trampoline_addr(SB)/8, $libc_fdopendir_trampoline<>(SB) TEXT libc_getgroups_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getgroups(SB) - GLOBL ·libc_getgroups_trampoline_addr(SB), RODATA, $8 DATA ·libc_getgroups_trampoline_addr(SB)/8, $libc_getgroups_trampoline<>(SB) TEXT libc_setgroups_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setgroups(SB) - GLOBL ·libc_setgroups_trampoline_addr(SB), RODATA, $8 DATA ·libc_setgroups_trampoline_addr(SB)/8, $libc_setgroups_trampoline<>(SB) TEXT libc_wait4_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_wait4(SB) - GLOBL ·libc_wait4_trampoline_addr(SB), RODATA, $8 DATA ·libc_wait4_trampoline_addr(SB)/8, $libc_wait4_trampoline<>(SB) TEXT libc_accept_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_accept(SB) - GLOBL ·libc_accept_trampoline_addr(SB), RODATA, $8 DATA ·libc_accept_trampoline_addr(SB)/8, 
$libc_accept_trampoline<>(SB) TEXT libc_bind_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_bind(SB) - GLOBL ·libc_bind_trampoline_addr(SB), RODATA, $8 DATA ·libc_bind_trampoline_addr(SB)/8, $libc_bind_trampoline<>(SB) TEXT libc_connect_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_connect(SB) - GLOBL ·libc_connect_trampoline_addr(SB), RODATA, $8 DATA ·libc_connect_trampoline_addr(SB)/8, $libc_connect_trampoline<>(SB) TEXT libc_socket_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_socket(SB) - GLOBL ·libc_socket_trampoline_addr(SB), RODATA, $8 DATA ·libc_socket_trampoline_addr(SB)/8, $libc_socket_trampoline<>(SB) TEXT libc_getsockopt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsockopt(SB) - GLOBL ·libc_getsockopt_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsockopt_trampoline_addr(SB)/8, $libc_getsockopt_trampoline<>(SB) TEXT libc_setsockopt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setsockopt(SB) - GLOBL ·libc_setsockopt_trampoline_addr(SB), RODATA, $8 DATA ·libc_setsockopt_trampoline_addr(SB)/8, $libc_setsockopt_trampoline<>(SB) TEXT libc_getpeername_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpeername(SB) - GLOBL ·libc_getpeername_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpeername_trampoline_addr(SB)/8, $libc_getpeername_trampoline<>(SB) TEXT libc_getsockname_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsockname(SB) - GLOBL ·libc_getsockname_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsockname_trampoline_addr(SB)/8, $libc_getsockname_trampoline<>(SB) TEXT libc_shutdown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shutdown(SB) - GLOBL ·libc_shutdown_trampoline_addr(SB), RODATA, $8 DATA ·libc_shutdown_trampoline_addr(SB)/8, $libc_shutdown_trampoline<>(SB) TEXT libc_socketpair_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_socketpair(SB) - GLOBL ·libc_socketpair_trampoline_addr(SB), RODATA, $8 DATA ·libc_socketpair_trampoline_addr(SB)/8, $libc_socketpair_trampoline<>(SB) TEXT libc_recvfrom_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_recvfrom(SB) - GLOBL ·libc_recvfrom_trampoline_addr(SB), RODATA, $8 DATA ·libc_recvfrom_trampoline_addr(SB)/8, $libc_recvfrom_trampoline<>(SB) TEXT libc_sendto_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendto(SB) - GLOBL ·libc_sendto_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendto_trampoline_addr(SB)/8, $libc_sendto_trampoline<>(SB) TEXT libc_recvmsg_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_recvmsg(SB) - GLOBL ·libc_recvmsg_trampoline_addr(SB), RODATA, $8 DATA ·libc_recvmsg_trampoline_addr(SB)/8, $libc_recvmsg_trampoline<>(SB) TEXT libc_sendmsg_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendmsg(SB) - GLOBL ·libc_sendmsg_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendmsg_trampoline_addr(SB)/8, $libc_sendmsg_trampoline<>(SB) TEXT libc_kevent_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kevent(SB) - GLOBL ·libc_kevent_trampoline_addr(SB), RODATA, $8 DATA ·libc_kevent_trampoline_addr(SB)/8, $libc_kevent_trampoline<>(SB) TEXT libc_utimes_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimes(SB) - GLOBL ·libc_utimes_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimes_trampoline_addr(SB)/8, $libc_utimes_trampoline<>(SB) TEXT libc_futimes_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_futimes(SB) - GLOBL ·libc_futimes_trampoline_addr(SB), RODATA, $8 DATA ·libc_futimes_trampoline_addr(SB)/8, $libc_futimes_trampoline<>(SB) TEXT libc_poll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_poll(SB) - GLOBL ·libc_poll_trampoline_addr(SB), RODATA, $8 DATA ·libc_poll_trampoline_addr(SB)/8, $libc_poll_trampoline<>(SB) TEXT libc_madvise_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_madvise(SB) - GLOBL ·libc_madvise_trampoline_addr(SB), RODATA, $8 DATA 
·libc_madvise_trampoline_addr(SB)/8, $libc_madvise_trampoline<>(SB) TEXT libc_mlock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mlock(SB) - GLOBL ·libc_mlock_trampoline_addr(SB), RODATA, $8 DATA ·libc_mlock_trampoline_addr(SB)/8, $libc_mlock_trampoline<>(SB) TEXT libc_mlockall_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mlockall(SB) - GLOBL ·libc_mlockall_trampoline_addr(SB), RODATA, $8 DATA ·libc_mlockall_trampoline_addr(SB)/8, $libc_mlockall_trampoline<>(SB) TEXT libc_mprotect_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mprotect(SB) - GLOBL ·libc_mprotect_trampoline_addr(SB), RODATA, $8 DATA ·libc_mprotect_trampoline_addr(SB)/8, $libc_mprotect_trampoline<>(SB) TEXT libc_msync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_msync(SB) - GLOBL ·libc_msync_trampoline_addr(SB), RODATA, $8 DATA ·libc_msync_trampoline_addr(SB)/8, $libc_msync_trampoline<>(SB) TEXT libc_munlock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munlock(SB) - GLOBL ·libc_munlock_trampoline_addr(SB), RODATA, $8 DATA ·libc_munlock_trampoline_addr(SB)/8, $libc_munlock_trampoline<>(SB) TEXT libc_munlockall_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munlockall(SB) - GLOBL ·libc_munlockall_trampoline_addr(SB), RODATA, $8 DATA ·libc_munlockall_trampoline_addr(SB)/8, $libc_munlockall_trampoline<>(SB) TEXT libc_closedir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_closedir(SB) - GLOBL ·libc_closedir_trampoline_addr(SB), RODATA, $8 DATA ·libc_closedir_trampoline_addr(SB)/8, $libc_closedir_trampoline<>(SB) TEXT libc_readdir_r_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_readdir_r(SB) - GLOBL ·libc_readdir_r_trampoline_addr(SB), RODATA, $8 DATA ·libc_readdir_r_trampoline_addr(SB)/8, $libc_readdir_r_trampoline<>(SB) TEXT libc_pipe_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pipe(SB) - GLOBL ·libc_pipe_trampoline_addr(SB), RODATA, $8 DATA ·libc_pipe_trampoline_addr(SB)/8, $libc_pipe_trampoline<>(SB) TEXT libc_getxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getxattr(SB) - GLOBL ·libc_getxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_getxattr_trampoline_addr(SB)/8, $libc_getxattr_trampoline<>(SB) TEXT libc_fgetxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fgetxattr(SB) - GLOBL ·libc_fgetxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fgetxattr_trampoline_addr(SB)/8, $libc_fgetxattr_trampoline<>(SB) TEXT libc_setxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setxattr(SB) - GLOBL ·libc_setxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_setxattr_trampoline_addr(SB)/8, $libc_setxattr_trampoline<>(SB) TEXT libc_fsetxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fsetxattr(SB) - GLOBL ·libc_fsetxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fsetxattr_trampoline_addr(SB)/8, $libc_fsetxattr_trampoline<>(SB) TEXT libc_removexattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_removexattr(SB) - GLOBL ·libc_removexattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_removexattr_trampoline_addr(SB)/8, $libc_removexattr_trampoline<>(SB) TEXT libc_fremovexattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fremovexattr(SB) - GLOBL ·libc_fremovexattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fremovexattr_trampoline_addr(SB)/8, $libc_fremovexattr_trampoline<>(SB) TEXT libc_listxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_listxattr(SB) - GLOBL ·libc_listxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_listxattr_trampoline_addr(SB)/8, $libc_listxattr_trampoline<>(SB) TEXT libc_flistxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_flistxattr(SB) - GLOBL ·libc_flistxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_flistxattr_trampoline_addr(SB)/8, $libc_flistxattr_trampoline<>(SB) TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 
JMP libc_utimensat(SB) - GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fcntl(SB) - GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) TEXT libc_kill_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kill(SB) - GLOBL ·libc_kill_trampoline_addr(SB), RODATA, $8 DATA ·libc_kill_trampoline_addr(SB)/8, $libc_kill_trampoline<>(SB) TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ioctl(SB) - GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB) TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sysctl(SB) - GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) - GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendfile_trampoline_addr(SB)/8, $libc_sendfile_trampoline<>(SB) TEXT libc_shmat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmat(SB) - GLOBL ·libc_shmat_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmat_trampoline_addr(SB)/8, $libc_shmat_trampoline<>(SB) TEXT libc_shmctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmctl(SB) - GLOBL ·libc_shmctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmctl_trampoline_addr(SB)/8, $libc_shmctl_trampoline<>(SB) TEXT libc_shmdt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmdt(SB) - GLOBL ·libc_shmdt_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmdt_trampoline_addr(SB)/8, $libc_shmdt_trampoline<>(SB) TEXT libc_shmget_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmget(SB) - GLOBL ·libc_shmget_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmget_trampoline_addr(SB)/8, $libc_shmget_trampoline<>(SB) TEXT libc_access_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_access(SB) - GLOBL ·libc_access_trampoline_addr(SB), RODATA, $8 DATA ·libc_access_trampoline_addr(SB)/8, $libc_access_trampoline<>(SB) TEXT libc_adjtime_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_adjtime(SB) - GLOBL ·libc_adjtime_trampoline_addr(SB), RODATA, $8 DATA ·libc_adjtime_trampoline_addr(SB)/8, $libc_adjtime_trampoline<>(SB) TEXT libc_chdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chdir(SB) - GLOBL ·libc_chdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_chdir_trampoline_addr(SB)/8, $libc_chdir_trampoline<>(SB) TEXT libc_chflags_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chflags(SB) - GLOBL ·libc_chflags_trampoline_addr(SB), RODATA, $8 DATA ·libc_chflags_trampoline_addr(SB)/8, $libc_chflags_trampoline<>(SB) TEXT libc_chmod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chmod(SB) - GLOBL ·libc_chmod_trampoline_addr(SB), RODATA, $8 DATA ·libc_chmod_trampoline_addr(SB)/8, $libc_chmod_trampoline<>(SB) TEXT libc_chown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chown(SB) - GLOBL ·libc_chown_trampoline_addr(SB), RODATA, $8 DATA ·libc_chown_trampoline_addr(SB)/8, $libc_chown_trampoline<>(SB) TEXT libc_chroot_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chroot(SB) - GLOBL ·libc_chroot_trampoline_addr(SB), RODATA, $8 DATA ·libc_chroot_trampoline_addr(SB)/8, $libc_chroot_trampoline<>(SB) TEXT libc_clock_gettime_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clock_gettime(SB) - GLOBL ·libc_clock_gettime_trampoline_addr(SB), RODATA, $8 DATA ·libc_clock_gettime_trampoline_addr(SB)/8, $libc_clock_gettime_trampoline<>(SB) TEXT libc_close_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_close(SB) - GLOBL ·libc_close_trampoline_addr(SB), RODATA, $8 DATA 
·libc_close_trampoline_addr(SB)/8, $libc_close_trampoline<>(SB) TEXT libc_clonefile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clonefile(SB) - GLOBL ·libc_clonefile_trampoline_addr(SB), RODATA, $8 DATA ·libc_clonefile_trampoline_addr(SB)/8, $libc_clonefile_trampoline<>(SB) TEXT libc_clonefileat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clonefileat(SB) - GLOBL ·libc_clonefileat_trampoline_addr(SB), RODATA, $8 DATA ·libc_clonefileat_trampoline_addr(SB)/8, $libc_clonefileat_trampoline<>(SB) TEXT libc_dup_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_dup(SB) - GLOBL ·libc_dup_trampoline_addr(SB), RODATA, $8 DATA ·libc_dup_trampoline_addr(SB)/8, $libc_dup_trampoline<>(SB) TEXT libc_dup2_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_dup2(SB) - GLOBL ·libc_dup2_trampoline_addr(SB), RODATA, $8 DATA ·libc_dup2_trampoline_addr(SB)/8, $libc_dup2_trampoline<>(SB) TEXT libc_exchangedata_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_exchangedata(SB) - GLOBL ·libc_exchangedata_trampoline_addr(SB), RODATA, $8 DATA ·libc_exchangedata_trampoline_addr(SB)/8, $libc_exchangedata_trampoline<>(SB) TEXT libc_exit_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_exit(SB) - GLOBL ·libc_exit_trampoline_addr(SB), RODATA, $8 DATA ·libc_exit_trampoline_addr(SB)/8, $libc_exit_trampoline<>(SB) TEXT libc_faccessat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_faccessat(SB) - GLOBL ·libc_faccessat_trampoline_addr(SB), RODATA, $8 DATA ·libc_faccessat_trampoline_addr(SB)/8, $libc_faccessat_trampoline<>(SB) TEXT libc_fchdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchdir(SB) - GLOBL ·libc_fchdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchdir_trampoline_addr(SB)/8, $libc_fchdir_trampoline<>(SB) TEXT libc_fchflags_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchflags(SB) - GLOBL ·libc_fchflags_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchflags_trampoline_addr(SB)/8, $libc_fchflags_trampoline<>(SB) TEXT libc_fchmod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchmod(SB) - GLOBL ·libc_fchmod_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchmod_trampoline_addr(SB)/8, $libc_fchmod_trampoline<>(SB) TEXT libc_fchmodat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchmodat(SB) - GLOBL ·libc_fchmodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchmodat_trampoline_addr(SB)/8, $libc_fchmodat_trampoline<>(SB) TEXT libc_fchown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchown(SB) - GLOBL ·libc_fchown_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchown_trampoline_addr(SB)/8, $libc_fchown_trampoline<>(SB) TEXT libc_fchownat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchownat(SB) - GLOBL ·libc_fchownat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchownat_trampoline_addr(SB)/8, $libc_fchownat_trampoline<>(SB) TEXT libc_fclonefileat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fclonefileat(SB) - GLOBL ·libc_fclonefileat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fclonefileat_trampoline_addr(SB)/8, $libc_fclonefileat_trampoline<>(SB) TEXT libc_flock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_flock(SB) - GLOBL ·libc_flock_trampoline_addr(SB), RODATA, $8 DATA ·libc_flock_trampoline_addr(SB)/8, $libc_flock_trampoline<>(SB) TEXT libc_fpathconf_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fpathconf(SB) - GLOBL ·libc_fpathconf_trampoline_addr(SB), RODATA, $8 DATA ·libc_fpathconf_trampoline_addr(SB)/8, $libc_fpathconf_trampoline<>(SB) TEXT libc_fsync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fsync(SB) - GLOBL ·libc_fsync_trampoline_addr(SB), RODATA, $8 DATA ·libc_fsync_trampoline_addr(SB)/8, $libc_fsync_trampoline<>(SB) TEXT libc_ftruncate_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ftruncate(SB) - GLOBL ·libc_ftruncate_trampoline_addr(SB), RODATA, $8 
DATA ·libc_ftruncate_trampoline_addr(SB)/8, $libc_ftruncate_trampoline<>(SB) TEXT libc_getcwd_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getcwd(SB) - GLOBL ·libc_getcwd_trampoline_addr(SB), RODATA, $8 DATA ·libc_getcwd_trampoline_addr(SB)/8, $libc_getcwd_trampoline<>(SB) TEXT libc_getdtablesize_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getdtablesize(SB) - GLOBL ·libc_getdtablesize_trampoline_addr(SB), RODATA, $8 DATA ·libc_getdtablesize_trampoline_addr(SB)/8, $libc_getdtablesize_trampoline<>(SB) TEXT libc_getegid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getegid(SB) - GLOBL ·libc_getegid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getegid_trampoline_addr(SB)/8, $libc_getegid_trampoline<>(SB) TEXT libc_geteuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_geteuid(SB) - GLOBL ·libc_geteuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_geteuid_trampoline_addr(SB)/8, $libc_geteuid_trampoline<>(SB) TEXT libc_getgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getgid(SB) - GLOBL ·libc_getgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getgid_trampoline_addr(SB)/8, $libc_getgid_trampoline<>(SB) TEXT libc_getpgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpgid(SB) - GLOBL ·libc_getpgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpgid_trampoline_addr(SB)/8, $libc_getpgid_trampoline<>(SB) TEXT libc_getpgrp_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpgrp(SB) - GLOBL ·libc_getpgrp_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpgrp_trampoline_addr(SB)/8, $libc_getpgrp_trampoline<>(SB) TEXT libc_getpid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpid(SB) - GLOBL ·libc_getpid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpid_trampoline_addr(SB)/8, $libc_getpid_trampoline<>(SB) TEXT libc_getppid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getppid(SB) - GLOBL ·libc_getppid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getppid_trampoline_addr(SB)/8, $libc_getppid_trampoline<>(SB) TEXT libc_getpriority_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpriority(SB) - GLOBL ·libc_getpriority_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpriority_trampoline_addr(SB)/8, $libc_getpriority_trampoline<>(SB) TEXT libc_getrlimit_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getrlimit(SB) - GLOBL ·libc_getrlimit_trampoline_addr(SB), RODATA, $8 DATA ·libc_getrlimit_trampoline_addr(SB)/8, $libc_getrlimit_trampoline<>(SB) TEXT libc_getrusage_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getrusage(SB) - GLOBL ·libc_getrusage_trampoline_addr(SB), RODATA, $8 DATA ·libc_getrusage_trampoline_addr(SB)/8, $libc_getrusage_trampoline<>(SB) TEXT libc_getsid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsid(SB) - GLOBL ·libc_getsid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsid_trampoline_addr(SB)/8, $libc_getsid_trampoline<>(SB) TEXT libc_gettimeofday_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_gettimeofday(SB) - GLOBL ·libc_gettimeofday_trampoline_addr(SB), RODATA, $8 DATA ·libc_gettimeofday_trampoline_addr(SB)/8, $libc_gettimeofday_trampoline<>(SB) TEXT libc_getuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getuid(SB) - GLOBL ·libc_getuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getuid_trampoline_addr(SB)/8, $libc_getuid_trampoline<>(SB) TEXT libc_issetugid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_issetugid(SB) - GLOBL ·libc_issetugid_trampoline_addr(SB), RODATA, $8 DATA ·libc_issetugid_trampoline_addr(SB)/8, $libc_issetugid_trampoline<>(SB) TEXT libc_kqueue_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kqueue(SB) - GLOBL ·libc_kqueue_trampoline_addr(SB), RODATA, $8 DATA ·libc_kqueue_trampoline_addr(SB)/8, $libc_kqueue_trampoline<>(SB) TEXT libc_lchown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lchown(SB) - 
GLOBL ·libc_lchown_trampoline_addr(SB), RODATA, $8 DATA ·libc_lchown_trampoline_addr(SB)/8, $libc_lchown_trampoline<>(SB) TEXT libc_link_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_link(SB) - GLOBL ·libc_link_trampoline_addr(SB), RODATA, $8 DATA ·libc_link_trampoline_addr(SB)/8, $libc_link_trampoline<>(SB) TEXT libc_linkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_linkat(SB) - GLOBL ·libc_linkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_linkat_trampoline_addr(SB)/8, $libc_linkat_trampoline<>(SB) TEXT libc_listen_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_listen(SB) - GLOBL ·libc_listen_trampoline_addr(SB), RODATA, $8 DATA ·libc_listen_trampoline_addr(SB)/8, $libc_listen_trampoline<>(SB) TEXT libc_mkdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkdir(SB) - GLOBL ·libc_mkdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkdir_trampoline_addr(SB)/8, $libc_mkdir_trampoline<>(SB) TEXT libc_mkdirat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkdirat(SB) - GLOBL ·libc_mkdirat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkdirat_trampoline_addr(SB)/8, $libc_mkdirat_trampoline<>(SB) TEXT libc_mkfifo_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkfifo(SB) - GLOBL ·libc_mkfifo_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkfifo_trampoline_addr(SB)/8, $libc_mkfifo_trampoline<>(SB) TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mknod(SB) - GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB) TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mount(SB) - GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_open(SB) - GLOBL ·libc_open_trampoline_addr(SB), RODATA, $8 DATA ·libc_open_trampoline_addr(SB)/8, $libc_open_trampoline<>(SB) TEXT libc_openat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_openat(SB) - GLOBL ·libc_openat_trampoline_addr(SB), RODATA, $8 DATA ·libc_openat_trampoline_addr(SB)/8, $libc_openat_trampoline<>(SB) TEXT libc_pathconf_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pathconf(SB) - GLOBL ·libc_pathconf_trampoline_addr(SB), RODATA, $8 DATA ·libc_pathconf_trampoline_addr(SB)/8, $libc_pathconf_trampoline<>(SB) TEXT libc_pread_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pread(SB) - GLOBL ·libc_pread_trampoline_addr(SB), RODATA, $8 DATA ·libc_pread_trampoline_addr(SB)/8, $libc_pread_trampoline<>(SB) TEXT libc_pwrite_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pwrite(SB) - GLOBL ·libc_pwrite_trampoline_addr(SB), RODATA, $8 DATA ·libc_pwrite_trampoline_addr(SB)/8, $libc_pwrite_trampoline<>(SB) TEXT libc_read_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_read(SB) - GLOBL ·libc_read_trampoline_addr(SB), RODATA, $8 DATA ·libc_read_trampoline_addr(SB)/8, $libc_read_trampoline<>(SB) TEXT libc_readlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_readlink(SB) - GLOBL ·libc_readlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_readlink_trampoline_addr(SB)/8, $libc_readlink_trampoline<>(SB) TEXT libc_readlinkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_readlinkat(SB) - GLOBL ·libc_readlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_readlinkat_trampoline_addr(SB)/8, $libc_readlinkat_trampoline<>(SB) TEXT libc_rename_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_rename(SB) - GLOBL ·libc_rename_trampoline_addr(SB), RODATA, $8 DATA ·libc_rename_trampoline_addr(SB)/8, $libc_rename_trampoline<>(SB) TEXT libc_renameat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_renameat(SB) - GLOBL ·libc_renameat_trampoline_addr(SB), RODATA, $8 DATA ·libc_renameat_trampoline_addr(SB)/8, 
$libc_renameat_trampoline<>(SB) TEXT libc_revoke_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_revoke(SB) - GLOBL ·libc_revoke_trampoline_addr(SB), RODATA, $8 DATA ·libc_revoke_trampoline_addr(SB)/8, $libc_revoke_trampoline<>(SB) TEXT libc_rmdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_rmdir(SB) - GLOBL ·libc_rmdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_rmdir_trampoline_addr(SB)/8, $libc_rmdir_trampoline<>(SB) TEXT libc_lseek_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lseek(SB) - GLOBL ·libc_lseek_trampoline_addr(SB), RODATA, $8 DATA ·libc_lseek_trampoline_addr(SB)/8, $libc_lseek_trampoline<>(SB) TEXT libc_select_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_select(SB) - GLOBL ·libc_select_trampoline_addr(SB), RODATA, $8 DATA ·libc_select_trampoline_addr(SB)/8, $libc_select_trampoline<>(SB) @@ -712,192 +595,160 @@ DATA ·libc_setattrlist_trampoline_addr(SB)/8, $libc_setattrlist_trampoline<>(SB TEXT libc_setegid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setegid(SB) - GLOBL ·libc_setegid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setegid_trampoline_addr(SB)/8, $libc_setegid_trampoline<>(SB) TEXT libc_seteuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_seteuid(SB) - GLOBL ·libc_seteuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_seteuid_trampoline_addr(SB)/8, $libc_seteuid_trampoline<>(SB) TEXT libc_setgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setgid(SB) - GLOBL ·libc_setgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setgid_trampoline_addr(SB)/8, $libc_setgid_trampoline<>(SB) TEXT libc_setlogin_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setlogin(SB) - GLOBL ·libc_setlogin_trampoline_addr(SB), RODATA, $8 DATA ·libc_setlogin_trampoline_addr(SB)/8, $libc_setlogin_trampoline<>(SB) TEXT libc_setpgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setpgid(SB) - GLOBL ·libc_setpgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setpgid_trampoline_addr(SB)/8, $libc_setpgid_trampoline<>(SB) TEXT libc_setpriority_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setpriority(SB) - GLOBL ·libc_setpriority_trampoline_addr(SB), RODATA, $8 DATA ·libc_setpriority_trampoline_addr(SB)/8, $libc_setpriority_trampoline<>(SB) TEXT libc_setprivexec_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setprivexec(SB) - GLOBL ·libc_setprivexec_trampoline_addr(SB), RODATA, $8 DATA ·libc_setprivexec_trampoline_addr(SB)/8, $libc_setprivexec_trampoline<>(SB) TEXT libc_setregid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setregid(SB) - GLOBL ·libc_setregid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setregid_trampoline_addr(SB)/8, $libc_setregid_trampoline<>(SB) TEXT libc_setreuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setreuid(SB) - GLOBL ·libc_setreuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setreuid_trampoline_addr(SB)/8, $libc_setreuid_trampoline<>(SB) TEXT libc_setsid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setsid(SB) - GLOBL ·libc_setsid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setsid_trampoline_addr(SB)/8, $libc_setsid_trampoline<>(SB) TEXT libc_settimeofday_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_settimeofday(SB) - GLOBL ·libc_settimeofday_trampoline_addr(SB), RODATA, $8 DATA ·libc_settimeofday_trampoline_addr(SB)/8, $libc_settimeofday_trampoline<>(SB) TEXT libc_setuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setuid(SB) - GLOBL ·libc_setuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setuid_trampoline_addr(SB)/8, $libc_setuid_trampoline<>(SB) TEXT libc_symlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_symlink(SB) - GLOBL ·libc_symlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_symlink_trampoline_addr(SB)/8, $libc_symlink_trampoline<>(SB) TEXT libc_symlinkat_trampoline<>(SB),NOSPLIT,$0-0 
JMP libc_symlinkat(SB) - GLOBL ·libc_symlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_symlinkat_trampoline_addr(SB)/8, $libc_symlinkat_trampoline<>(SB) TEXT libc_sync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sync(SB) - GLOBL ·libc_sync_trampoline_addr(SB), RODATA, $8 DATA ·libc_sync_trampoline_addr(SB)/8, $libc_sync_trampoline<>(SB) TEXT libc_truncate_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_truncate(SB) - GLOBL ·libc_truncate_trampoline_addr(SB), RODATA, $8 DATA ·libc_truncate_trampoline_addr(SB)/8, $libc_truncate_trampoline<>(SB) TEXT libc_umask_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_umask(SB) - GLOBL ·libc_umask_trampoline_addr(SB), RODATA, $8 DATA ·libc_umask_trampoline_addr(SB)/8, $libc_umask_trampoline<>(SB) TEXT libc_undelete_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_undelete(SB) - GLOBL ·libc_undelete_trampoline_addr(SB), RODATA, $8 DATA ·libc_undelete_trampoline_addr(SB)/8, $libc_undelete_trampoline<>(SB) TEXT libc_unlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unlink(SB) - GLOBL ·libc_unlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_unlink_trampoline_addr(SB)/8, $libc_unlink_trampoline<>(SB) TEXT libc_unlinkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unlinkat(SB) - GLOBL ·libc_unlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_unlinkat_trampoline_addr(SB)/8, $libc_unlinkat_trampoline<>(SB) TEXT libc_unmount_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unmount(SB) - GLOBL ·libc_unmount_trampoline_addr(SB), RODATA, $8 DATA ·libc_unmount_trampoline_addr(SB)/8, $libc_unmount_trampoline<>(SB) TEXT libc_write_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_write(SB) - GLOBL ·libc_write_trampoline_addr(SB), RODATA, $8 DATA ·libc_write_trampoline_addr(SB)/8, $libc_write_trampoline<>(SB) TEXT libc_mmap_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mmap(SB) - GLOBL ·libc_mmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_mmap_trampoline_addr(SB)/8, $libc_mmap_trampoline<>(SB) TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munmap(SB) - GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) TEXT libc_fstat64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstat64(SB) - GLOBL ·libc_fstat64_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstat64_trampoline_addr(SB)/8, $libc_fstat64_trampoline<>(SB) TEXT libc_fstatat64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstatat64(SB) - GLOBL ·libc_fstatat64_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstatat64_trampoline_addr(SB)/8, $libc_fstatat64_trampoline<>(SB) TEXT libc_fstatfs64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstatfs64(SB) - GLOBL ·libc_fstatfs64_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstatfs64_trampoline_addr(SB)/8, $libc_fstatfs64_trampoline<>(SB) TEXT libc_getfsstat64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getfsstat64(SB) - GLOBL ·libc_getfsstat64_trampoline_addr(SB), RODATA, $8 DATA ·libc_getfsstat64_trampoline_addr(SB)/8, $libc_getfsstat64_trampoline<>(SB) TEXT libc_lstat64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lstat64(SB) - GLOBL ·libc_lstat64_trampoline_addr(SB), RODATA, $8 DATA ·libc_lstat64_trampoline_addr(SB)/8, $libc_lstat64_trampoline<>(SB) TEXT libc_ptrace_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ptrace(SB) - GLOBL ·libc_ptrace_trampoline_addr(SB), RODATA, $8 DATA ·libc_ptrace_trampoline_addr(SB)/8, $libc_ptrace_trampoline<>(SB) TEXT libc_stat64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_stat64(SB) - GLOBL ·libc_stat64_trampoline_addr(SB), RODATA, $8 DATA ·libc_stat64_trampoline_addr(SB)/8, $libc_stat64_trampoline<>(SB) TEXT libc_statfs64_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_statfs64(SB) - GLOBL 
·libc_statfs64_trampoline_addr(SB), RODATA, $8
DATA ·libc_statfs64_trampoline_addr(SB)/8, $libc_statfs64_trampoline<>(SB)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
index 51d6f3fb2..1b40b997b 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go
@@ -2,7 +2,6 @@
 // Code generated by the command above; see README.md. DO NOT EDIT.
 
 //go:build darwin && arm64
-// +build darwin,arm64
 
 package unix
 
@@ -725,6 +724,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) {
 	return
 }
 
+var libc_ioctl_trampoline_addr uintptr
+
+//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
+
+// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
+
 func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
 	_, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg))
 	if e1 != 0 {
@@ -733,10 +738,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) {
 	return
 }
 
-var libc_ioctl_trampoline_addr uintptr
-
-//go:cgo_import_dynamic libc_ioctl ioctl "/usr/lib/libSystem.B.dylib"
-
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
 func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) {
@@ -2410,28 +2411,6 @@ var libc_munmap_trampoline_addr uintptr
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
-	r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
-	r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
 func Fstat(fd int, stat *Stat_t) (err error) {
 	_, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0)
 	if e1 != 0 {
@@ -2521,14 +2500,6 @@ func ptrace1(request int, pid int, addr uintptr, data uintptr) (err error) {
 	return
 }
 
-func ptrace1Ptr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) {
-	_, _, e1 := syscall_syscall6(libc_ptrace_trampoline_addr, uintptr(request), uintptr(pid), addr, uintptr(data), 0, 0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
 var libc_ptrace_trampoline_addr uintptr
 
 //go:cgo_import_dynamic libc_ptrace ptrace "/usr/lib/libSystem.B.dylib"
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
index c3b82c037..08362c1ab 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
+++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s
@@ -5,703 +5,586 @@
 TEXT libc_fdopendir_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_fdopendir(SB)
-
 GLOBL ·libc_fdopendir_trampoline_addr(SB), RODATA, $8
 DATA ·libc_fdopendir_trampoline_addr(SB)/8, $libc_fdopendir_trampoline<>(SB)
 
 TEXT libc_getgroups_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_getgroups(SB)
-
 GLOBL ·libc_getgroups_trampoline_addr(SB), RODATA, $8
 DATA ·libc_getgroups_trampoline_addr(SB)/8, $libc_getgroups_trampoline<>(SB)
 
 TEXT libc_setgroups_trampoline<>(SB),NOSPLIT,$0-0
 	JMP	libc_setgroups(SB)
-
GLOBL ·libc_setgroups_trampoline_addr(SB), RODATA, $8 DATA ·libc_setgroups_trampoline_addr(SB)/8, $libc_setgroups_trampoline<>(SB) TEXT libc_wait4_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_wait4(SB) - GLOBL ·libc_wait4_trampoline_addr(SB), RODATA, $8 DATA ·libc_wait4_trampoline_addr(SB)/8, $libc_wait4_trampoline<>(SB) TEXT libc_accept_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_accept(SB) - GLOBL ·libc_accept_trampoline_addr(SB), RODATA, $8 DATA ·libc_accept_trampoline_addr(SB)/8, $libc_accept_trampoline<>(SB) TEXT libc_bind_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_bind(SB) - GLOBL ·libc_bind_trampoline_addr(SB), RODATA, $8 DATA ·libc_bind_trampoline_addr(SB)/8, $libc_bind_trampoline<>(SB) TEXT libc_connect_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_connect(SB) - GLOBL ·libc_connect_trampoline_addr(SB), RODATA, $8 DATA ·libc_connect_trampoline_addr(SB)/8, $libc_connect_trampoline<>(SB) TEXT libc_socket_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_socket(SB) - GLOBL ·libc_socket_trampoline_addr(SB), RODATA, $8 DATA ·libc_socket_trampoline_addr(SB)/8, $libc_socket_trampoline<>(SB) TEXT libc_getsockopt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsockopt(SB) - GLOBL ·libc_getsockopt_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsockopt_trampoline_addr(SB)/8, $libc_getsockopt_trampoline<>(SB) TEXT libc_setsockopt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setsockopt(SB) - GLOBL ·libc_setsockopt_trampoline_addr(SB), RODATA, $8 DATA ·libc_setsockopt_trampoline_addr(SB)/8, $libc_setsockopt_trampoline<>(SB) TEXT libc_getpeername_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpeername(SB) - GLOBL ·libc_getpeername_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpeername_trampoline_addr(SB)/8, $libc_getpeername_trampoline<>(SB) TEXT libc_getsockname_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsockname(SB) - GLOBL ·libc_getsockname_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsockname_trampoline_addr(SB)/8, $libc_getsockname_trampoline<>(SB) TEXT libc_shutdown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shutdown(SB) - GLOBL ·libc_shutdown_trampoline_addr(SB), RODATA, $8 DATA ·libc_shutdown_trampoline_addr(SB)/8, $libc_shutdown_trampoline<>(SB) TEXT libc_socketpair_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_socketpair(SB) - GLOBL ·libc_socketpair_trampoline_addr(SB), RODATA, $8 DATA ·libc_socketpair_trampoline_addr(SB)/8, $libc_socketpair_trampoline<>(SB) TEXT libc_recvfrom_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_recvfrom(SB) - GLOBL ·libc_recvfrom_trampoline_addr(SB), RODATA, $8 DATA ·libc_recvfrom_trampoline_addr(SB)/8, $libc_recvfrom_trampoline<>(SB) TEXT libc_sendto_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendto(SB) - GLOBL ·libc_sendto_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendto_trampoline_addr(SB)/8, $libc_sendto_trampoline<>(SB) TEXT libc_recvmsg_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_recvmsg(SB) - GLOBL ·libc_recvmsg_trampoline_addr(SB), RODATA, $8 DATA ·libc_recvmsg_trampoline_addr(SB)/8, $libc_recvmsg_trampoline<>(SB) TEXT libc_sendmsg_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendmsg(SB) - GLOBL ·libc_sendmsg_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendmsg_trampoline_addr(SB)/8, $libc_sendmsg_trampoline<>(SB) TEXT libc_kevent_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kevent(SB) - GLOBL ·libc_kevent_trampoline_addr(SB), RODATA, $8 DATA ·libc_kevent_trampoline_addr(SB)/8, $libc_kevent_trampoline<>(SB) TEXT libc_utimes_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimes(SB) - GLOBL ·libc_utimes_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimes_trampoline_addr(SB)/8, $libc_utimes_trampoline<>(SB) TEXT 
libc_futimes_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_futimes(SB) - GLOBL ·libc_futimes_trampoline_addr(SB), RODATA, $8 DATA ·libc_futimes_trampoline_addr(SB)/8, $libc_futimes_trampoline<>(SB) TEXT libc_poll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_poll(SB) - GLOBL ·libc_poll_trampoline_addr(SB), RODATA, $8 DATA ·libc_poll_trampoline_addr(SB)/8, $libc_poll_trampoline<>(SB) TEXT libc_madvise_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_madvise(SB) - GLOBL ·libc_madvise_trampoline_addr(SB), RODATA, $8 DATA ·libc_madvise_trampoline_addr(SB)/8, $libc_madvise_trampoline<>(SB) TEXT libc_mlock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mlock(SB) - GLOBL ·libc_mlock_trampoline_addr(SB), RODATA, $8 DATA ·libc_mlock_trampoline_addr(SB)/8, $libc_mlock_trampoline<>(SB) TEXT libc_mlockall_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mlockall(SB) - GLOBL ·libc_mlockall_trampoline_addr(SB), RODATA, $8 DATA ·libc_mlockall_trampoline_addr(SB)/8, $libc_mlockall_trampoline<>(SB) TEXT libc_mprotect_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mprotect(SB) - GLOBL ·libc_mprotect_trampoline_addr(SB), RODATA, $8 DATA ·libc_mprotect_trampoline_addr(SB)/8, $libc_mprotect_trampoline<>(SB) TEXT libc_msync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_msync(SB) - GLOBL ·libc_msync_trampoline_addr(SB), RODATA, $8 DATA ·libc_msync_trampoline_addr(SB)/8, $libc_msync_trampoline<>(SB) TEXT libc_munlock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munlock(SB) - GLOBL ·libc_munlock_trampoline_addr(SB), RODATA, $8 DATA ·libc_munlock_trampoline_addr(SB)/8, $libc_munlock_trampoline<>(SB) TEXT libc_munlockall_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munlockall(SB) - GLOBL ·libc_munlockall_trampoline_addr(SB), RODATA, $8 DATA ·libc_munlockall_trampoline_addr(SB)/8, $libc_munlockall_trampoline<>(SB) TEXT libc_closedir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_closedir(SB) - GLOBL ·libc_closedir_trampoline_addr(SB), RODATA, $8 DATA ·libc_closedir_trampoline_addr(SB)/8, $libc_closedir_trampoline<>(SB) TEXT libc_readdir_r_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_readdir_r(SB) - GLOBL ·libc_readdir_r_trampoline_addr(SB), RODATA, $8 DATA ·libc_readdir_r_trampoline_addr(SB)/8, $libc_readdir_r_trampoline<>(SB) TEXT libc_pipe_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pipe(SB) - GLOBL ·libc_pipe_trampoline_addr(SB), RODATA, $8 DATA ·libc_pipe_trampoline_addr(SB)/8, $libc_pipe_trampoline<>(SB) TEXT libc_getxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getxattr(SB) - GLOBL ·libc_getxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_getxattr_trampoline_addr(SB)/8, $libc_getxattr_trampoline<>(SB) TEXT libc_fgetxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fgetxattr(SB) - GLOBL ·libc_fgetxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fgetxattr_trampoline_addr(SB)/8, $libc_fgetxattr_trampoline<>(SB) TEXT libc_setxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setxattr(SB) - GLOBL ·libc_setxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_setxattr_trampoline_addr(SB)/8, $libc_setxattr_trampoline<>(SB) TEXT libc_fsetxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fsetxattr(SB) - GLOBL ·libc_fsetxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fsetxattr_trampoline_addr(SB)/8, $libc_fsetxattr_trampoline<>(SB) TEXT libc_removexattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_removexattr(SB) - GLOBL ·libc_removexattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_removexattr_trampoline_addr(SB)/8, $libc_removexattr_trampoline<>(SB) TEXT libc_fremovexattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fremovexattr(SB) - GLOBL ·libc_fremovexattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_fremovexattr_trampoline_addr(SB)/8, 
$libc_fremovexattr_trampoline<>(SB) TEXT libc_listxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_listxattr(SB) - GLOBL ·libc_listxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_listxattr_trampoline_addr(SB)/8, $libc_listxattr_trampoline<>(SB) TEXT libc_flistxattr_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_flistxattr(SB) - GLOBL ·libc_flistxattr_trampoline_addr(SB), RODATA, $8 DATA ·libc_flistxattr_trampoline_addr(SB)/8, $libc_flistxattr_trampoline<>(SB) TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) - GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fcntl(SB) - GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) TEXT libc_kill_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kill(SB) - GLOBL ·libc_kill_trampoline_addr(SB), RODATA, $8 DATA ·libc_kill_trampoline_addr(SB)/8, $libc_kill_trampoline<>(SB) TEXT libc_ioctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ioctl(SB) - GLOBL ·libc_ioctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_ioctl_trampoline_addr(SB)/8, $libc_ioctl_trampoline<>(SB) TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sysctl(SB) - GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) - GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 DATA ·libc_sendfile_trampoline_addr(SB)/8, $libc_sendfile_trampoline<>(SB) TEXT libc_shmat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmat(SB) - GLOBL ·libc_shmat_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmat_trampoline_addr(SB)/8, $libc_shmat_trampoline<>(SB) TEXT libc_shmctl_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmctl(SB) - GLOBL ·libc_shmctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmctl_trampoline_addr(SB)/8, $libc_shmctl_trampoline<>(SB) TEXT libc_shmdt_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmdt(SB) - GLOBL ·libc_shmdt_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmdt_trampoline_addr(SB)/8, $libc_shmdt_trampoline<>(SB) TEXT libc_shmget_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_shmget(SB) - GLOBL ·libc_shmget_trampoline_addr(SB), RODATA, $8 DATA ·libc_shmget_trampoline_addr(SB)/8, $libc_shmget_trampoline<>(SB) TEXT libc_access_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_access(SB) - GLOBL ·libc_access_trampoline_addr(SB), RODATA, $8 DATA ·libc_access_trampoline_addr(SB)/8, $libc_access_trampoline<>(SB) TEXT libc_adjtime_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_adjtime(SB) - GLOBL ·libc_adjtime_trampoline_addr(SB), RODATA, $8 DATA ·libc_adjtime_trampoline_addr(SB)/8, $libc_adjtime_trampoline<>(SB) TEXT libc_chdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chdir(SB) - GLOBL ·libc_chdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_chdir_trampoline_addr(SB)/8, $libc_chdir_trampoline<>(SB) TEXT libc_chflags_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chflags(SB) - GLOBL ·libc_chflags_trampoline_addr(SB), RODATA, $8 DATA ·libc_chflags_trampoline_addr(SB)/8, $libc_chflags_trampoline<>(SB) TEXT libc_chmod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chmod(SB) - GLOBL ·libc_chmod_trampoline_addr(SB), RODATA, $8 DATA ·libc_chmod_trampoline_addr(SB)/8, $libc_chmod_trampoline<>(SB) TEXT libc_chown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_chown(SB) - GLOBL ·libc_chown_trampoline_addr(SB), RODATA, $8 DATA ·libc_chown_trampoline_addr(SB)/8, $libc_chown_trampoline<>(SB) TEXT libc_chroot_trampoline<>(SB),NOSPLIT,$0-0 JMP 
libc_chroot(SB) - GLOBL ·libc_chroot_trampoline_addr(SB), RODATA, $8 DATA ·libc_chroot_trampoline_addr(SB)/8, $libc_chroot_trampoline<>(SB) TEXT libc_clock_gettime_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clock_gettime(SB) - GLOBL ·libc_clock_gettime_trampoline_addr(SB), RODATA, $8 DATA ·libc_clock_gettime_trampoline_addr(SB)/8, $libc_clock_gettime_trampoline<>(SB) TEXT libc_close_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_close(SB) - GLOBL ·libc_close_trampoline_addr(SB), RODATA, $8 DATA ·libc_close_trampoline_addr(SB)/8, $libc_close_trampoline<>(SB) TEXT libc_clonefile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clonefile(SB) - GLOBL ·libc_clonefile_trampoline_addr(SB), RODATA, $8 DATA ·libc_clonefile_trampoline_addr(SB)/8, $libc_clonefile_trampoline<>(SB) TEXT libc_clonefileat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_clonefileat(SB) - GLOBL ·libc_clonefileat_trampoline_addr(SB), RODATA, $8 DATA ·libc_clonefileat_trampoline_addr(SB)/8, $libc_clonefileat_trampoline<>(SB) TEXT libc_dup_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_dup(SB) - GLOBL ·libc_dup_trampoline_addr(SB), RODATA, $8 DATA ·libc_dup_trampoline_addr(SB)/8, $libc_dup_trampoline<>(SB) TEXT libc_dup2_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_dup2(SB) - GLOBL ·libc_dup2_trampoline_addr(SB), RODATA, $8 DATA ·libc_dup2_trampoline_addr(SB)/8, $libc_dup2_trampoline<>(SB) TEXT libc_exchangedata_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_exchangedata(SB) - GLOBL ·libc_exchangedata_trampoline_addr(SB), RODATA, $8 DATA ·libc_exchangedata_trampoline_addr(SB)/8, $libc_exchangedata_trampoline<>(SB) TEXT libc_exit_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_exit(SB) - GLOBL ·libc_exit_trampoline_addr(SB), RODATA, $8 DATA ·libc_exit_trampoline_addr(SB)/8, $libc_exit_trampoline<>(SB) TEXT libc_faccessat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_faccessat(SB) - GLOBL ·libc_faccessat_trampoline_addr(SB), RODATA, $8 DATA ·libc_faccessat_trampoline_addr(SB)/8, $libc_faccessat_trampoline<>(SB) TEXT libc_fchdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchdir(SB) - GLOBL ·libc_fchdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchdir_trampoline_addr(SB)/8, $libc_fchdir_trampoline<>(SB) TEXT libc_fchflags_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchflags(SB) - GLOBL ·libc_fchflags_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchflags_trampoline_addr(SB)/8, $libc_fchflags_trampoline<>(SB) TEXT libc_fchmod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchmod(SB) - GLOBL ·libc_fchmod_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchmod_trampoline_addr(SB)/8, $libc_fchmod_trampoline<>(SB) TEXT libc_fchmodat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchmodat(SB) - GLOBL ·libc_fchmodat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchmodat_trampoline_addr(SB)/8, $libc_fchmodat_trampoline<>(SB) TEXT libc_fchown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchown(SB) - GLOBL ·libc_fchown_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchown_trampoline_addr(SB)/8, $libc_fchown_trampoline<>(SB) TEXT libc_fchownat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fchownat(SB) - GLOBL ·libc_fchownat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fchownat_trampoline_addr(SB)/8, $libc_fchownat_trampoline<>(SB) TEXT libc_fclonefileat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fclonefileat(SB) - GLOBL ·libc_fclonefileat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fclonefileat_trampoline_addr(SB)/8, $libc_fclonefileat_trampoline<>(SB) TEXT libc_flock_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_flock(SB) - GLOBL ·libc_flock_trampoline_addr(SB), RODATA, $8 DATA ·libc_flock_trampoline_addr(SB)/8, $libc_flock_trampoline<>(SB) TEXT 
libc_fpathconf_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fpathconf(SB) - GLOBL ·libc_fpathconf_trampoline_addr(SB), RODATA, $8 DATA ·libc_fpathconf_trampoline_addr(SB)/8, $libc_fpathconf_trampoline<>(SB) TEXT libc_fsync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fsync(SB) - GLOBL ·libc_fsync_trampoline_addr(SB), RODATA, $8 DATA ·libc_fsync_trampoline_addr(SB)/8, $libc_fsync_trampoline<>(SB) TEXT libc_ftruncate_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ftruncate(SB) - GLOBL ·libc_ftruncate_trampoline_addr(SB), RODATA, $8 DATA ·libc_ftruncate_trampoline_addr(SB)/8, $libc_ftruncate_trampoline<>(SB) TEXT libc_getcwd_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getcwd(SB) - GLOBL ·libc_getcwd_trampoline_addr(SB), RODATA, $8 DATA ·libc_getcwd_trampoline_addr(SB)/8, $libc_getcwd_trampoline<>(SB) TEXT libc_getdtablesize_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getdtablesize(SB) - GLOBL ·libc_getdtablesize_trampoline_addr(SB), RODATA, $8 DATA ·libc_getdtablesize_trampoline_addr(SB)/8, $libc_getdtablesize_trampoline<>(SB) TEXT libc_getegid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getegid(SB) - GLOBL ·libc_getegid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getegid_trampoline_addr(SB)/8, $libc_getegid_trampoline<>(SB) TEXT libc_geteuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_geteuid(SB) - GLOBL ·libc_geteuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_geteuid_trampoline_addr(SB)/8, $libc_geteuid_trampoline<>(SB) TEXT libc_getgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getgid(SB) - GLOBL ·libc_getgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getgid_trampoline_addr(SB)/8, $libc_getgid_trampoline<>(SB) TEXT libc_getpgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpgid(SB) - GLOBL ·libc_getpgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpgid_trampoline_addr(SB)/8, $libc_getpgid_trampoline<>(SB) TEXT libc_getpgrp_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpgrp(SB) - GLOBL ·libc_getpgrp_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpgrp_trampoline_addr(SB)/8, $libc_getpgrp_trampoline<>(SB) TEXT libc_getpid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpid(SB) - GLOBL ·libc_getpid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpid_trampoline_addr(SB)/8, $libc_getpid_trampoline<>(SB) TEXT libc_getppid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getppid(SB) - GLOBL ·libc_getppid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getppid_trampoline_addr(SB)/8, $libc_getppid_trampoline<>(SB) TEXT libc_getpriority_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getpriority(SB) - GLOBL ·libc_getpriority_trampoline_addr(SB), RODATA, $8 DATA ·libc_getpriority_trampoline_addr(SB)/8, $libc_getpriority_trampoline<>(SB) TEXT libc_getrlimit_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getrlimit(SB) - GLOBL ·libc_getrlimit_trampoline_addr(SB), RODATA, $8 DATA ·libc_getrlimit_trampoline_addr(SB)/8, $libc_getrlimit_trampoline<>(SB) TEXT libc_getrusage_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getrusage(SB) - GLOBL ·libc_getrusage_trampoline_addr(SB), RODATA, $8 DATA ·libc_getrusage_trampoline_addr(SB)/8, $libc_getrusage_trampoline<>(SB) TEXT libc_getsid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getsid(SB) - GLOBL ·libc_getsid_trampoline_addr(SB), RODATA, $8 DATA ·libc_getsid_trampoline_addr(SB)/8, $libc_getsid_trampoline<>(SB) TEXT libc_gettimeofday_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_gettimeofday(SB) - GLOBL ·libc_gettimeofday_trampoline_addr(SB), RODATA, $8 DATA ·libc_gettimeofday_trampoline_addr(SB)/8, $libc_gettimeofday_trampoline<>(SB) TEXT libc_getuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getuid(SB) - GLOBL ·libc_getuid_trampoline_addr(SB), RODATA, $8 DATA 
·libc_getuid_trampoline_addr(SB)/8, $libc_getuid_trampoline<>(SB) TEXT libc_issetugid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_issetugid(SB) - GLOBL ·libc_issetugid_trampoline_addr(SB), RODATA, $8 DATA ·libc_issetugid_trampoline_addr(SB)/8, $libc_issetugid_trampoline<>(SB) TEXT libc_kqueue_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_kqueue(SB) - GLOBL ·libc_kqueue_trampoline_addr(SB), RODATA, $8 DATA ·libc_kqueue_trampoline_addr(SB)/8, $libc_kqueue_trampoline<>(SB) TEXT libc_lchown_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lchown(SB) - GLOBL ·libc_lchown_trampoline_addr(SB), RODATA, $8 DATA ·libc_lchown_trampoline_addr(SB)/8, $libc_lchown_trampoline<>(SB) TEXT libc_link_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_link(SB) - GLOBL ·libc_link_trampoline_addr(SB), RODATA, $8 DATA ·libc_link_trampoline_addr(SB)/8, $libc_link_trampoline<>(SB) TEXT libc_linkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_linkat(SB) - GLOBL ·libc_linkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_linkat_trampoline_addr(SB)/8, $libc_linkat_trampoline<>(SB) TEXT libc_listen_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_listen(SB) - GLOBL ·libc_listen_trampoline_addr(SB), RODATA, $8 DATA ·libc_listen_trampoline_addr(SB)/8, $libc_listen_trampoline<>(SB) TEXT libc_mkdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkdir(SB) - GLOBL ·libc_mkdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkdir_trampoline_addr(SB)/8, $libc_mkdir_trampoline<>(SB) TEXT libc_mkdirat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkdirat(SB) - GLOBL ·libc_mkdirat_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkdirat_trampoline_addr(SB)/8, $libc_mkdirat_trampoline<>(SB) TEXT libc_mkfifo_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mkfifo(SB) - GLOBL ·libc_mkfifo_trampoline_addr(SB), RODATA, $8 DATA ·libc_mkfifo_trampoline_addr(SB)/8, $libc_mkfifo_trampoline<>(SB) TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mknod(SB) - GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB) TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mount(SB) - GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_open(SB) - GLOBL ·libc_open_trampoline_addr(SB), RODATA, $8 DATA ·libc_open_trampoline_addr(SB)/8, $libc_open_trampoline<>(SB) TEXT libc_openat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_openat(SB) - GLOBL ·libc_openat_trampoline_addr(SB), RODATA, $8 DATA ·libc_openat_trampoline_addr(SB)/8, $libc_openat_trampoline<>(SB) TEXT libc_pathconf_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pathconf(SB) - GLOBL ·libc_pathconf_trampoline_addr(SB), RODATA, $8 DATA ·libc_pathconf_trampoline_addr(SB)/8, $libc_pathconf_trampoline<>(SB) TEXT libc_pread_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pread(SB) - GLOBL ·libc_pread_trampoline_addr(SB), RODATA, $8 DATA ·libc_pread_trampoline_addr(SB)/8, $libc_pread_trampoline<>(SB) TEXT libc_pwrite_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_pwrite(SB) - GLOBL ·libc_pwrite_trampoline_addr(SB), RODATA, $8 DATA ·libc_pwrite_trampoline_addr(SB)/8, $libc_pwrite_trampoline<>(SB) TEXT libc_read_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_read(SB) - GLOBL ·libc_read_trampoline_addr(SB), RODATA, $8 DATA ·libc_read_trampoline_addr(SB)/8, $libc_read_trampoline<>(SB) TEXT libc_readlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_readlink(SB) - GLOBL ·libc_readlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_readlink_trampoline_addr(SB)/8, $libc_readlink_trampoline<>(SB) TEXT libc_readlinkat_trampoline<>(SB),NOSPLIT,$0-0 JMP 
libc_readlinkat(SB) - GLOBL ·libc_readlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_readlinkat_trampoline_addr(SB)/8, $libc_readlinkat_trampoline<>(SB) TEXT libc_rename_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_rename(SB) - GLOBL ·libc_rename_trampoline_addr(SB), RODATA, $8 DATA ·libc_rename_trampoline_addr(SB)/8, $libc_rename_trampoline<>(SB) TEXT libc_renameat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_renameat(SB) - GLOBL ·libc_renameat_trampoline_addr(SB), RODATA, $8 DATA ·libc_renameat_trampoline_addr(SB)/8, $libc_renameat_trampoline<>(SB) TEXT libc_revoke_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_revoke(SB) - GLOBL ·libc_revoke_trampoline_addr(SB), RODATA, $8 DATA ·libc_revoke_trampoline_addr(SB)/8, $libc_revoke_trampoline<>(SB) TEXT libc_rmdir_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_rmdir(SB) - GLOBL ·libc_rmdir_trampoline_addr(SB), RODATA, $8 DATA ·libc_rmdir_trampoline_addr(SB)/8, $libc_rmdir_trampoline<>(SB) TEXT libc_lseek_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lseek(SB) - GLOBL ·libc_lseek_trampoline_addr(SB), RODATA, $8 DATA ·libc_lseek_trampoline_addr(SB)/8, $libc_lseek_trampoline<>(SB) TEXT libc_select_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_select(SB) - GLOBL ·libc_select_trampoline_addr(SB), RODATA, $8 DATA ·libc_select_trampoline_addr(SB)/8, $libc_select_trampoline<>(SB) @@ -712,192 +595,160 @@ DATA ·libc_setattrlist_trampoline_addr(SB)/8, $libc_setattrlist_trampoline<>(SB TEXT libc_setegid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setegid(SB) - GLOBL ·libc_setegid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setegid_trampoline_addr(SB)/8, $libc_setegid_trampoline<>(SB) TEXT libc_seteuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_seteuid(SB) - GLOBL ·libc_seteuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_seteuid_trampoline_addr(SB)/8, $libc_seteuid_trampoline<>(SB) TEXT libc_setgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setgid(SB) - GLOBL ·libc_setgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setgid_trampoline_addr(SB)/8, $libc_setgid_trampoline<>(SB) TEXT libc_setlogin_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setlogin(SB) - GLOBL ·libc_setlogin_trampoline_addr(SB), RODATA, $8 DATA ·libc_setlogin_trampoline_addr(SB)/8, $libc_setlogin_trampoline<>(SB) TEXT libc_setpgid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setpgid(SB) - GLOBL ·libc_setpgid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setpgid_trampoline_addr(SB)/8, $libc_setpgid_trampoline<>(SB) TEXT libc_setpriority_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setpriority(SB) - GLOBL ·libc_setpriority_trampoline_addr(SB), RODATA, $8 DATA ·libc_setpriority_trampoline_addr(SB)/8, $libc_setpriority_trampoline<>(SB) TEXT libc_setprivexec_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setprivexec(SB) - GLOBL ·libc_setprivexec_trampoline_addr(SB), RODATA, $8 DATA ·libc_setprivexec_trampoline_addr(SB)/8, $libc_setprivexec_trampoline<>(SB) TEXT libc_setregid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setregid(SB) - GLOBL ·libc_setregid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setregid_trampoline_addr(SB)/8, $libc_setregid_trampoline<>(SB) TEXT libc_setreuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setreuid(SB) - GLOBL ·libc_setreuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setreuid_trampoline_addr(SB)/8, $libc_setreuid_trampoline<>(SB) TEXT libc_setsid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setsid(SB) - GLOBL ·libc_setsid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setsid_trampoline_addr(SB)/8, $libc_setsid_trampoline<>(SB) TEXT libc_settimeofday_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_settimeofday(SB) - GLOBL ·libc_settimeofday_trampoline_addr(SB), RODATA, $8 DATA 
·libc_settimeofday_trampoline_addr(SB)/8, $libc_settimeofday_trampoline<>(SB) TEXT libc_setuid_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_setuid(SB) - GLOBL ·libc_setuid_trampoline_addr(SB), RODATA, $8 DATA ·libc_setuid_trampoline_addr(SB)/8, $libc_setuid_trampoline<>(SB) TEXT libc_symlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_symlink(SB) - GLOBL ·libc_symlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_symlink_trampoline_addr(SB)/8, $libc_symlink_trampoline<>(SB) TEXT libc_symlinkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_symlinkat(SB) - GLOBL ·libc_symlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_symlinkat_trampoline_addr(SB)/8, $libc_symlinkat_trampoline<>(SB) TEXT libc_sync_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sync(SB) - GLOBL ·libc_sync_trampoline_addr(SB), RODATA, $8 DATA ·libc_sync_trampoline_addr(SB)/8, $libc_sync_trampoline<>(SB) TEXT libc_truncate_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_truncate(SB) - GLOBL ·libc_truncate_trampoline_addr(SB), RODATA, $8 DATA ·libc_truncate_trampoline_addr(SB)/8, $libc_truncate_trampoline<>(SB) TEXT libc_umask_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_umask(SB) - GLOBL ·libc_umask_trampoline_addr(SB), RODATA, $8 DATA ·libc_umask_trampoline_addr(SB)/8, $libc_umask_trampoline<>(SB) TEXT libc_undelete_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_undelete(SB) - GLOBL ·libc_undelete_trampoline_addr(SB), RODATA, $8 DATA ·libc_undelete_trampoline_addr(SB)/8, $libc_undelete_trampoline<>(SB) TEXT libc_unlink_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unlink(SB) - GLOBL ·libc_unlink_trampoline_addr(SB), RODATA, $8 DATA ·libc_unlink_trampoline_addr(SB)/8, $libc_unlink_trampoline<>(SB) TEXT libc_unlinkat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unlinkat(SB) - GLOBL ·libc_unlinkat_trampoline_addr(SB), RODATA, $8 DATA ·libc_unlinkat_trampoline_addr(SB)/8, $libc_unlinkat_trampoline<>(SB) TEXT libc_unmount_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_unmount(SB) - GLOBL ·libc_unmount_trampoline_addr(SB), RODATA, $8 DATA ·libc_unmount_trampoline_addr(SB)/8, $libc_unmount_trampoline<>(SB) TEXT libc_write_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_write(SB) - GLOBL ·libc_write_trampoline_addr(SB), RODATA, $8 DATA ·libc_write_trampoline_addr(SB)/8, $libc_write_trampoline<>(SB) TEXT libc_mmap_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_mmap(SB) - GLOBL ·libc_mmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_mmap_trampoline_addr(SB)/8, $libc_mmap_trampoline<>(SB) TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_munmap(SB) - GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) TEXT libc_fstat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstat(SB) - GLOBL ·libc_fstat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstat_trampoline_addr(SB)/8, $libc_fstat_trampoline<>(SB) TEXT libc_fstatat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstatat(SB) - GLOBL ·libc_fstatat_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstatat_trampoline_addr(SB)/8, $libc_fstatat_trampoline<>(SB) TEXT libc_fstatfs_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_fstatfs(SB) - GLOBL ·libc_fstatfs_trampoline_addr(SB), RODATA, $8 DATA ·libc_fstatfs_trampoline_addr(SB)/8, $libc_fstatfs_trampoline<>(SB) TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_getfsstat(SB) - GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) TEXT libc_lstat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_lstat(SB) - GLOBL ·libc_lstat_trampoline_addr(SB), RODATA, $8 DATA ·libc_lstat_trampoline_addr(SB)/8, 
$libc_lstat_trampoline<>(SB) TEXT libc_ptrace_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ptrace(SB) - GLOBL ·libc_ptrace_trampoline_addr(SB), RODATA, $8 DATA ·libc_ptrace_trampoline_addr(SB)/8, $libc_ptrace_trampoline<>(SB) TEXT libc_stat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_stat(SB) - GLOBL ·libc_stat_trampoline_addr(SB), RODATA, $8 DATA ·libc_stat_trampoline_addr(SB)/8, $libc_stat_trampoline<>(SB) TEXT libc_statfs_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_statfs(SB) - GLOBL ·libc_statfs_trampoline_addr(SB), RODATA, $8 DATA ·libc_statfs_trampoline_addr(SB)/8, $libc_statfs_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go index 0eabac7ad..aad65fc79 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_dragonfly_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build dragonfly && amd64 -// +build dragonfly,amd64 package unix @@ -1642,28 +1641,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) { r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) nfd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go index ee313eb00..c0096391a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build freebsd && 386 -// +build freebsd,386 package unix @@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) { r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) nfd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go index 4c986e448..7664df749 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build freebsd && amd64 -// +build freebsd,amd64 package unix @@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) { r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) nfd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go index 555216944..ae099182c 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build freebsd && arm -// +build freebsd,arm package unix @@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) { r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) nfd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go index 67a226fbf..11fd5d45b 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build freebsd && arm64 -// +build freebsd,arm64 package unix @@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) { r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0) nfd = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go index f0b9ddaaa..c3d2d6530 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
 //go:build freebsd && riscv64
-// +build freebsd,riscv64
 
 package unix
 
@@ -1862,28 +1861,6 @@ func munmap(addr uintptr, length uintptr) (err error) {
 
 // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
 
-func readlen(fd int, buf *byte, nbuf int) (n int, err error) {
-	r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
-func writelen(fd int, buf *byte, nbuf int) (n int, err error) {
-	r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf))
-	n = int(r0)
-	if e1 != 0 {
-		err = errnoErr(e1)
-	}
-	return
-}
-
-// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT
-
 func accept4(fd int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (nfd int, err error) {
 	r0, _, e1 := Syscall6(SYS_ACCEPT4, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
 	nfd = int(r0)
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
index b57c7050d..c698cbc01 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_illumos_amd64.go
@@ -2,7 +2,6 @@
 // Code generated by the command above; see README.md. DO NOT EDIT.
 
 //go:build illumos && amd64
-// +build illumos,amd64
 
 package unix
 
@@ -40,7 +39,7 @@ func readv(fd int, iovs []Iovec) (n int, err error) {
 	r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procreadv)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), 0, 0, 0)
 	n = int(r0)
 	if e1 != 0 {
-		err = e1
+		err = errnoErr(e1)
 	}
 	return
 }
@@ -55,7 +54,7 @@ func preadv(fd int, iovs []Iovec, off int64) (n int, err error) {
 	r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpreadv)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), uintptr(off), 0, 0)
 	n = int(r0)
 	if e1 != 0 {
-		err = e1
+		err = errnoErr(e1)
 	}
 	return
 }
@@ -70,7 +69,7 @@ func writev(fd int, iovs []Iovec) (n int, err error) {
 	r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwritev)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), 0, 0, 0)
 	n = int(r0)
 	if e1 != 0 {
-		err = e1
+		err = errnoErr(e1)
 	}
 	return
 }
@@ -85,7 +84,7 @@ func pwritev(fd int, iovs []Iovec, off int64) (n int, err error) {
 	r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpwritev)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(iovs)), uintptr(off), 0, 0)
 	n = int(r0)
 	if e1 != 0 {
-		err = e1
+		err = errnoErr(e1)
 	}
 	return
 }
@@ -96,7 +95,7 @@ func accept4(s int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (fd int,
 	r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procaccept4)), 4, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
 	fd = int(r0)
 	if e1 != 0 {
-		err = e1
+		err = errnoErr(e1)
 	}
 	return
 }
diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
index a07321bed..87d8612a1 100644
--- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go
+++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go
@@ -1,7 +1,6 @@
 // Code generated by mkmerge; DO NOT EDIT.
//go:build linux -// +build linux package unix @@ -38,6 +37,21 @@ func fchmodat(dirfd int, path string, mode uint32) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fchmodat2(dirfd int, path string, mode uint32, flags int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_FCHMODAT2, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctl(fd int, req uint, arg uintptr) (err error) { _, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -892,6 +906,16 @@ func Fspick(dirfd int, pathName string, flags int) (fd int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fsconfig(fd int, cmd uint, key *byte, value *byte, aux int) (err error) { + _, _, e1 := Syscall6(SYS_FSCONFIG, uintptr(fd), uintptr(cmd), uintptr(unsafe.Pointer(key)), uintptr(unsafe.Pointer(value)), uintptr(aux), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Getdents(fd int, buf []byte) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { @@ -1734,28 +1758,6 @@ func exitThread(code int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, p *byte, np int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, p *byte, np int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(p)), uintptr(np)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func readv(fd int, iovs []Iovec) (n int, err error) { var _p0 unsafe.Pointer if len(iovs) > 0 { @@ -2197,3 +2199,33 @@ func getresgid(rgid *_C_int, egid *_C_int, sgid *_C_int) { RawSyscallNoError(SYS_GETRESGID, uintptr(unsafe.Pointer(rgid)), uintptr(unsafe.Pointer(egid)), uintptr(unsafe.Pointer(sgid))) return } + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func schedSetattr(pid int, attr *SchedAttr, flags uint) (err error) { + _, _, e1 := Syscall(SYS_SCHED_SETATTR, uintptr(pid), uintptr(unsafe.Pointer(attr)), uintptr(flags)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func schedGetattr(pid int, attr *SchedAttr, size uint, flags uint) (err error) { + _, _, e1 := Syscall6(SYS_SCHED_GETATTR, uintptr(pid), uintptr(unsafe.Pointer(attr)), uintptr(size), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error) { + _, _, e1 := Syscall6(SYS_CACHESTAT, uintptr(fd), uintptr(unsafe.Pointer(crange)), uintptr(unsafe.Pointer(cstat)), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go index 07b549cc2..4def3e9fc 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go +++ 
b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && 386 -// +build linux,386 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go index 5f481bf83..fef2bc8ba 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && amd64 -// +build linux,amd64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go index 824cd52c7..a9fd76a88 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && arm -// +build linux,arm package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go index e77aecfe9..460065028 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && arm64 -// +build linux,arm64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go index 806ffd1e1..c8987d264 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_loong64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && loong64 -// +build linux,loong64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go index 961a3afb7..921f43061 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && mips -// +build linux,mips package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go index ed05005e9..44f067829 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && mips64 -// +build linux,mips64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go index d365b718f..e7fa0abf0 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && mips64le -// +build linux,mips64le package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go index c3f1b8bbd..8c5125675 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
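Among the wrappers newly generated for Linux above are fchmodat2, fsconfig, schedSetattr/schedGetattr and Cachestat; only Cachestat is exported. A minimal usage sketch, assuming a Linux 6.5+ kernel and the CachestatRange/Cachestat_t field names defined in the package's ztypes files (older kernels return ENOSYS):

	package main

	import (
		"fmt"
		"os"

		"golang.org/x/sys/unix"
	)

	func main() {
		f, err := os.Open("/etc/hostname")
		if err != nil {
			fmt.Println("open:", err)
			return
		}
		defer f.Close()

		// Len == 0 asks the kernel for page-cache statistics from Off to
		// the end of the file.
		crange := unix.CachestatRange{Off: 0, Len: 0}
		var cstat unix.Cachestat_t
		if err := unix.Cachestat(uint(f.Fd()), &crange, &cstat, 0); err != nil {
			fmt.Println("cachestat:", err) // ENOSYS before Linux 6.5
			return
		}
		fmt.Printf("cached=%d dirty=%d writeback=%d\n", cstat.Cache, cstat.Dirty, cstat.Writeback)
	}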
//go:build linux && mipsle -// +build linux,mipsle package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go index a6574cf98..7392fd45e 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && ppc -// +build linux,ppc package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go index f40990264..41180434e 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && ppc64 -// +build linux,ppc64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go index 9dfcc2997..40c6ce7ae 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && ppc64le -// +build linux,ppc64le package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go index 0ab4f2ed7..2cfe34adb 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && riscv64 -// +build linux,riscv64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go index 6cde32237..61e6f0709 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && s390x -// +build linux,s390x package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go index 5253d65bf..834b84204 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build linux && sparc64 -// +build linux,sparc64 package unix diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go index 35f499b32..e91ebc14a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build netbsd && 386 -// +build netbsd,386 package unix @@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go index 3cda65b0d..be28babbc 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build netbsd && amd64 -// +build netbsd,amd64 package unix @@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go index 1e1fea902..fb587e826 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build netbsd && arm -// +build netbsd,arm package unix @@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go index 3b77da110..d576438bb 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_netbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build netbsd && arm64 -// +build netbsd,arm64 package unix @@ -1824,28 +1823,6 @@ func munmap(addr uintptr, length uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go index 9ab9abf72..9dc42410b 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && 386 -// +build openbsd,386 package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s index 3dcacd30d..41b561731 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $4 DATA ·libc_sysctl_trampoline_addr(SB)/4, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $4 +DATA ·libc_fcntl_trampoline_addr(SB)/4, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $4 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $4 DATA ·libc_munmap_trampoline_addr(SB)/4, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $4 +DATA ·libc_getfsstat_trampoline_addr(SB)/4, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $4 DATA ·libc_utimensat_trampoline_addr(SB)/4, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $4 +DATA ·libc_pledge_trampoline_addr(SB)/4, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $4 +DATA ·libc_unveil_trampoline_addr(SB)/4, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go index 915761eab..0d3a0751c 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
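For OpenBSD the regenerated files drop the raw-syscall readlen/writelen helpers, route fcntl and getfsstat through libc trampolines, and add pledge and unveil bindings backed by the trampolines in the corresponding .s files. A minimal sketch of how the exported helpers layered on top of these bindings are typically used, assuming the Pledge*/Unveil* wrappers provided by x/sys/unix (compiles only for OpenBSD targets):

	package main

	import (
		"log"

		"golang.org/x/sys/unix"
	)

	func main() {
		// Make only /var/www visible to the process, read-only, then forbid
		// any further unveil calls.
		if err := unix.Unveil("/var/www", "r"); err != nil {
			log.Fatal(err)
		}
		if err := unix.UnveilBlock(); err != nil {
			log.Fatal(err)
		}
		// Restrict the process to basic stdio plus read-only path operations.
		if err := unix.PledgePromises("stdio rpath"); err != nil {
			log.Fatal(err)
		}
	}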
//go:build openbsd && amd64 -// +build openbsd,amd64 package unix @@ -585,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2213,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2222,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2251,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s index 2763620b0..4019a656f 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s 
+++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 +DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8 +DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go index 8e87fdf15..c39f7776d 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && arm -// +build openbsd,arm package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s index c92231404..ac4af24f9 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $4 DATA ·libc_sysctl_trampoline_addr(SB)/4, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $4 +DATA ·libc_fcntl_trampoline_addr(SB)/4, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $4 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $4 DATA ·libc_munmap_trampoline_addr(SB)/4, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $4 +DATA ·libc_getfsstat_trampoline_addr(SB)/4, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $4 DATA ·libc_utimensat_trampoline_addr(SB)/4, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $4 +DATA ·libc_pledge_trampoline_addr(SB)/4, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $4 +DATA ·libc_unveil_trampoline_addr(SB)/4, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go index 12a7a2160..57571d072 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && arm64 -// +build openbsd,arm64 package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s index a6bc32c92..f77d53212 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 +DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8 +DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go index b19e8aa03..e62963e67 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && mips64 -// +build openbsd,mips64 package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s index b4e7bceab..fae140b62 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_mips64.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 +DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8 +DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go index fb99594c9..00831354c 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && ppc64 -// +build openbsd,ppc64 package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s index ca3f76600..9d1e0ff06 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_ppc64.s @@ -213,6 +213,12 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + CALL libc_fcntl(SB) + RET +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 CALL libc_ppoll(SB) RET @@ -801,8 +807,26 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + CALL libc_getfsstat(SB) + RET +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 +DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 CALL libc_utimensat(SB) RET GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + CALL libc_pledge(SB) + RET +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + CALL libc_unveil(SB) + RET +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8 +DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go index 32cbbbc52..79029ed58 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build openbsd && riscv64 -// +build openbsd,riscv64 package unix @@ -549,6 +548,12 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { return } +var libc_ioctl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { @@ -557,10 +562,6 @@ func ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) { return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -583,6 +584,32 @@ var libc_sysctl_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fcntl(fd int, cmd int, arg int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_fcntl_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_fcntl fcntl "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fcntlPtr(fd int, cmd int, arg unsafe.Pointer) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_fcntl_trampoline_addr, uintptr(fd), uintptr(cmd), uintptr(arg)) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) @@ -2211,8 +2238,8 @@ var libc_munmap_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) +func getfsstat(stat *Statfs_t, bufsize uintptr, flags int) (n int, err error) { + r0, _, e1 := syscall_syscall(libc_getfsstat_trampoline_addr, uintptr(unsafe.Pointer(stat)), uintptr(bufsize), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2220,16 +2247,9 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { return } -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +var libc_getfsstat_trampoline_addr uintptr -func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} +//go:cgo_import_dynamic libc_getfsstat getfsstat "libc.so" // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT @@ -2249,3 +2269,31 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error var libc_utimensat_trampoline_addr uintptr //go:cgo_import_dynamic libc_utimensat utimensat "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func pledge(promises *byte, execpromises *byte) (err error) { + _, _, e1 := syscall_syscall(libc_pledge_trampoline_addr, uintptr(unsafe.Pointer(promises)), 
uintptr(unsafe.Pointer(execpromises)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_pledge_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_pledge pledge "libc.so" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func unveil(path *byte, flags *byte) (err error) { + _, _, e1 := syscall_syscall(libc_unveil_trampoline_addr, uintptr(unsafe.Pointer(path)), uintptr(unsafe.Pointer(flags)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_unveil_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_unveil unveil "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s index 477a7d5b2..da115f9a4 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_riscv64.s @@ -178,6 +178,11 @@ TEXT libc_sysctl_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_sysctl_trampoline_addr(SB), RODATA, $8 DATA ·libc_sysctl_trampoline_addr(SB)/8, $libc_sysctl_trampoline<>(SB) +TEXT libc_fcntl_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_fcntl(SB) +GLOBL ·libc_fcntl_trampoline_addr(SB), RODATA, $8 +DATA ·libc_fcntl_trampoline_addr(SB)/8, $libc_fcntl_trampoline<>(SB) + TEXT libc_ppoll_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_ppoll(SB) GLOBL ·libc_ppoll_trampoline_addr(SB), RODATA, $8 @@ -668,7 +673,22 @@ TEXT libc_munmap_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_munmap_trampoline_addr(SB), RODATA, $8 DATA ·libc_munmap_trampoline_addr(SB)/8, $libc_munmap_trampoline<>(SB) +TEXT libc_getfsstat_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_getfsstat(SB) +GLOBL ·libc_getfsstat_trampoline_addr(SB), RODATA, $8 +DATA ·libc_getfsstat_trampoline_addr(SB)/8, $libc_getfsstat_trampoline<>(SB) + TEXT libc_utimensat_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_utimensat(SB) GLOBL ·libc_utimensat_trampoline_addr(SB), RODATA, $8 DATA ·libc_utimensat_trampoline_addr(SB)/8, $libc_utimensat_trampoline<>(SB) + +TEXT libc_pledge_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_pledge(SB) +GLOBL ·libc_pledge_trampoline_addr(SB), RODATA, $8 +DATA ·libc_pledge_trampoline_addr(SB)/8, $libc_pledge_trampoline<>(SB) + +TEXT libc_unveil_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_unveil(SB) +GLOBL ·libc_unveil_trampoline_addr(SB), RODATA, $8 +DATA ·libc_unveil_trampoline_addr(SB)/8, $libc_unveil_trampoline<>(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go index 609d1c598..829b87feb 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build solaris && amd64 -// +build solaris,amd64 package unix @@ -436,7 +435,7 @@ func pipe(p *[2]_C_int) (n int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procpipe)), 1, uintptr(unsafe.Pointer(p)), 0, 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -446,7 +445,7 @@ func pipe(p *[2]_C_int) (n int, err error) { func pipe2(p *[2]_C_int, flags int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procpipe2)), 2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -456,7 +455,7 @@ func pipe2(p *[2]_C_int, flags int) (err error) { func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgetsockname)), 3, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -471,7 +470,7 @@ func Getcwd(buf []byte) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetcwd)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -482,7 +481,7 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procgetgroups)), 2, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -492,7 +491,7 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { func setgroups(ngid int, gid *_Gid_t) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procsetgroups)), 2, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -503,7 +502,7 @@ func wait4(pid int32, statusp *_C_int, options int, rusage *Rusage) (wpid int32, r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwait4)), 4, uintptr(pid), uintptr(unsafe.Pointer(statusp)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) wpid = int32(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -518,7 +517,7 @@ func gethostname(buf []byte) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgethostname)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -533,7 +532,7 @@ func utimes(path string, times *[2]Timeval) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procutimes)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -548,7 +547,7 @@ func utimensat(fd int, path string, times *[2]Timespec, flag int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procutimensat)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flag), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -559,7 +558,7 @@ func fcntl(fd int, cmd int, arg int) (val int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procfcntl)), 3, uintptr(fd), uintptr(cmd), uintptr(arg), 0, 0, 0) val = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -569,7 +568,7 @@ func fcntl(fd int, cmd int, arg int) (val int, err error) { func futimesat(fildes int, path *byte, times *[2]Timeval) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procfutimesat)), 3, uintptr(fildes), uintptr(unsafe.Pointer(path)), 
uintptr(unsafe.Pointer(times)), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -580,7 +579,7 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procaccept)), 3, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0) fd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -591,7 +590,7 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_recvmsg)), 3, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -602,7 +601,7 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_sendmsg)), 3, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -612,7 +611,7 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { func acct(path *byte) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procacct)), 1, uintptr(unsafe.Pointer(path)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -647,7 +646,7 @@ func ioctlRet(fd int, req int, arg uintptr) (ret int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procioctl)), 3, uintptr(fd), uintptr(req), uintptr(arg), 0, 0, 0) ret = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -658,7 +657,7 @@ func ioctlPtrRet(fd int, req int, arg unsafe.Pointer) (ret int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procioctl)), 3, uintptr(fd), uintptr(req), uintptr(arg), 0, 0, 0) ret = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -669,7 +668,7 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpoll)), 3, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -684,7 +683,7 @@ func Access(path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procAccess)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -694,7 +693,7 @@ func Access(path string, mode uint32) (err error) { func Adjtime(delta *Timeval, olddelta *Timeval) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procAdjtime)), 2, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -709,7 +708,7 @@ func Chdir(path string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChdir)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -724,7 +723,7 @@ func Chmod(path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChmod)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -739,7 +738,7 @@ func Chown(path string, uid int, gid int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procChown)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -754,7 +753,7 @@ func Chroot(path string) (err error) { } _, _, e1 := 
sysvicall6(uintptr(unsafe.Pointer(&procChroot)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -764,7 +763,7 @@ func Chroot(path string) (err error) { func ClockGettime(clockid int32, time *Timespec) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procClockGettime)), 2, uintptr(clockid), uintptr(unsafe.Pointer(time)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -774,7 +773,7 @@ func ClockGettime(clockid int32, time *Timespec) (err error) { func Close(fd int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procClose)), 1, uintptr(fd), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -790,7 +789,7 @@ func Creat(path string, mode uint32) (fd int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procCreat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0) fd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -801,7 +800,7 @@ func Dup(fd int) (nfd int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procDup)), 1, uintptr(fd), 0, 0, 0, 0, 0) nfd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -811,7 +810,7 @@ func Dup(fd int) (nfd int, err error) { func Dup2(oldfd int, newfd int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procDup2)), 2, uintptr(oldfd), uintptr(newfd), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -833,7 +832,7 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFaccessat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -843,7 +842,7 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { func Fchdir(fd int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchdir)), 1, uintptr(fd), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -853,7 +852,7 @@ func Fchdir(fd int) (err error) { func Fchmod(fd int, mode uint32) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchmod)), 2, uintptr(fd), uintptr(mode), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -868,7 +867,7 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchmodat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -878,7 +877,7 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { func Fchown(fd int, uid int, gid int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchown)), 3, uintptr(fd), uintptr(uid), uintptr(gid), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -893,7 +892,7 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFchownat)), 5, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -903,7 +902,7 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { func Fdatasync(fd int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFdatasync)), 1, uintptr(fd), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } 
@@ -913,7 +912,7 @@ func Fdatasync(fd int) (err error) { func Flock(fd int, how int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFlock)), 2, uintptr(fd), uintptr(how), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -924,7 +923,7 @@ func Fpathconf(fd int, name int) (val int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFpathconf)), 2, uintptr(fd), uintptr(name), 0, 0, 0, 0) val = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -934,7 +933,7 @@ func Fpathconf(fd int, name int) (val int, err error) { func Fstat(fd int, stat *Stat_t) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstat)), 2, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -949,7 +948,7 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstatat)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -959,7 +958,7 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { func Fstatvfs(fd int, vfsstat *Statvfs_t) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFstatvfs)), 2, uintptr(fd), uintptr(unsafe.Pointer(vfsstat)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -974,7 +973,7 @@ func Getdents(fd int, buf []byte, basep *uintptr) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetdents)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1001,7 +1000,7 @@ func Getpgid(pid int) (pgid int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetpgid)), 1, uintptr(pid), 0, 0, 0, 0, 0) pgid = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1012,7 +1011,7 @@ func Getpgrp() (pgid int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetpgrp)), 0, 0, 0, 0, 0, 0, 0) pgid = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1047,7 +1046,7 @@ func Getpriority(which int, who int) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procGetpriority)), 2, uintptr(which), uintptr(who), 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1057,7 +1056,7 @@ func Getpriority(which int, who int) (n int, err error) { func Getrlimit(which int, lim *Rlimit) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetrlimit)), 2, uintptr(which), uintptr(unsafe.Pointer(lim)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1067,7 +1066,7 @@ func Getrlimit(which int, lim *Rlimit) (err error) { func Getrusage(who int, rusage *Rusage) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetrusage)), 2, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1078,7 +1077,7 @@ func Getsid(pid int) (sid int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetsid)), 1, uintptr(pid), 0, 0, 0, 0, 0) sid = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1088,7 +1087,7 @@ func Getsid(pid int) (sid int, err error) { func Gettimeofday(tv *Timeval) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGettimeofday)), 1, 
uintptr(unsafe.Pointer(tv)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1106,7 +1105,7 @@ func Getuid() (uid int) { func Kill(pid int, signum syscall.Signal) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procKill)), 2, uintptr(pid), uintptr(signum), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1121,7 +1120,7 @@ func Lchown(path string, uid int, gid int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLchown)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1141,7 +1140,7 @@ func Link(path string, link string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLink)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1151,7 +1150,7 @@ func Link(path string, link string) (err error) { func Listen(s int, backlog int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_llisten)), 2, uintptr(s), uintptr(backlog), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1166,7 +1165,7 @@ func Lstat(path string, stat *Stat_t) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procLstat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1180,7 +1179,7 @@ func Madvise(b []byte, advice int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMadvise)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(advice), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1195,7 +1194,7 @@ func Mkdir(path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkdir)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1210,7 +1209,7 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkdirat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1225,7 +1224,7 @@ func Mkfifo(path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkfifo)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1240,7 +1239,7 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMkfifoat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1255,7 +1254,7 @@ func Mknod(path string, mode uint32, dev int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMknod)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1270,7 +1269,7 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMknodat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1284,7 +1283,7 @@ func Mlock(b []byte) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMlock)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), 0, 0, 0, 0) if e1 != 0 { - err 
= e1 + err = errnoErr(e1) } return } @@ -1294,7 +1293,7 @@ func Mlock(b []byte) (err error) { func Mlockall(flags int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMlockall)), 1, uintptr(flags), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1308,7 +1307,7 @@ func Mprotect(b []byte, prot int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMprotect)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(prot), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1322,7 +1321,7 @@ func Msync(b []byte, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMsync)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), uintptr(flags), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1336,7 +1335,7 @@ func Munlock(b []byte) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMunlock)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(b)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1346,7 +1345,7 @@ func Munlock(b []byte) (err error) { func Munlockall() (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procMunlockall)), 0, 0, 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1356,7 +1355,7 @@ func Munlockall() (err error) { func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procNanosleep)), 2, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1372,7 +1371,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procOpen)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0, 0) fd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1388,7 +1387,7 @@ func Openat(dirfd int, path string, flags int, mode uint32) (fd int, err error) r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procOpenat)), 4, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags), uintptr(mode), 0, 0) fd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1404,7 +1403,7 @@ func Pathconf(path string, name int) (val int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procPathconf)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0, 0, 0, 0) val = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1414,7 +1413,7 @@ func Pathconf(path string, name int) (val int, err error) { func Pause() (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procPause)), 0, 0, 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1429,7 +1428,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpread)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(offset), 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1444,7 +1443,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procpwrite)), 4, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(offset), 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1459,7 +1458,7 @@ func read(fd int, p []byte) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procread)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0) n = 
int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1479,7 +1478,7 @@ func Readlink(path string, buf []byte) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procReadlink)), 3, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(len(buf)), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1499,7 +1498,7 @@ func Rename(from string, to string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRename)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1519,7 +1518,7 @@ func Renameat(olddirfd int, oldpath string, newdirfd int, newpath string) (err e } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRenameat)), 4, uintptr(olddirfd), uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1)), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1534,7 +1533,7 @@ func Rmdir(path string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procRmdir)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1545,7 +1544,7 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proclseek)), 3, uintptr(fd), uintptr(offset), uintptr(whence), 0, 0, 0) newoffset = int64(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1556,7 +1555,7 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSelect)), 5, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1566,7 +1565,7 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err func Setegid(egid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetegid)), 1, uintptr(egid), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1576,7 +1575,7 @@ func Setegid(egid int) (err error) { func Seteuid(euid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSeteuid)), 1, uintptr(euid), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1586,7 +1585,7 @@ func Seteuid(euid int) (err error) { func Setgid(gid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetgid)), 1, uintptr(gid), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1600,7 +1599,7 @@ func Sethostname(p []byte) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSethostname)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1610,7 +1609,7 @@ func Sethostname(p []byte) (err error) { func Setpgid(pid int, pgid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetpgid)), 2, uintptr(pid), uintptr(pgid), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1620,7 +1619,7 @@ func Setpgid(pid int, pgid int) (err error) { func Setpriority(which int, who int, prio int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSetpriority)), 3, uintptr(which), uintptr(who), uintptr(prio), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1630,7 +1629,7 @@ func Setpriority(which int, who int, prio int) (err 
error) { func Setregid(rgid int, egid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetregid)), 2, uintptr(rgid), uintptr(egid), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1640,7 +1639,7 @@ func Setregid(rgid int, egid int) (err error) { func Setreuid(ruid int, euid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetreuid)), 2, uintptr(ruid), uintptr(euid), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1651,7 +1650,7 @@ func Setsid() (pid int, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetsid)), 0, 0, 0, 0, 0, 0, 0) pid = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1661,7 +1660,7 @@ func Setsid() (pid int, err error) { func Setuid(uid int) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procSetuid)), 1, uintptr(uid), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1671,7 +1670,7 @@ func Setuid(uid int) (err error) { func Shutdown(s int, how int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procshutdown)), 2, uintptr(s), uintptr(how), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1686,7 +1685,7 @@ func Stat(path string, stat *Stat_t) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procStat)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1701,7 +1700,7 @@ func Statvfs(path string, vfsstat *Statvfs_t) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procStatvfs)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(vfsstat)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1721,7 +1720,7 @@ func Symlink(path string, link string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSymlink)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1731,7 +1730,7 @@ func Symlink(path string, link string) (err error) { func Sync() (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSync)), 0, 0, 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1742,7 +1741,7 @@ func Sysconf(which int) (n int64, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procSysconf)), 1, uintptr(which), 0, 0, 0, 0, 0) n = int64(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1753,7 +1752,7 @@ func Times(tms *Tms) (ticks uintptr, err error) { r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procTimes)), 1, uintptr(unsafe.Pointer(tms)), 0, 0, 0, 0, 0) ticks = uintptr(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1768,7 +1767,7 @@ func Truncate(path string, length int64) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procTruncate)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(length), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1778,7 +1777,7 @@ func Truncate(path string, length int64) (err error) { func Fsync(fd int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFsync)), 1, uintptr(fd), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1788,7 +1787,7 @@ func Fsync(fd int) (err error) { func Ftruncate(fd int, length int64) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procFtruncate)), 2, uintptr(fd), uintptr(length), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = 
errnoErr(e1) } return } @@ -1806,7 +1805,7 @@ func Umask(mask int) (oldmask int) { func Uname(buf *Utsname) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procUname)), 1, uintptr(unsafe.Pointer(buf)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1821,7 +1820,7 @@ func Unmount(target string, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procumount)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1836,7 +1835,7 @@ func Unlink(path string) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUnlink)), 1, uintptr(unsafe.Pointer(_p0)), 0, 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1851,7 +1850,7 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUnlinkat)), 3, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1861,7 +1860,7 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { func Ustat(dev int, ubuf *Ustat_t) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUstat)), 2, uintptr(dev), uintptr(unsafe.Pointer(ubuf)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1876,7 +1875,7 @@ func Utime(path string, buf *Utimbuf) (err error) { } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procUtime)), 2, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(buf)), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1886,7 +1885,7 @@ func Utime(path string, buf *Utimbuf) (err error) { func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_bind)), 3, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1896,7 +1895,7 @@ func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_connect)), 3, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1907,7 +1906,7 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procmmap)), 6, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), uintptr(pos)) ret = uintptr(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1917,7 +1916,7 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( func munmap(addr uintptr, length uintptr) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procmunmap)), 2, uintptr(addr), uintptr(length), 0, 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1928,7 +1927,7 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procsendfile)), 4, uintptr(outfd), uintptr(infd), uintptr(unsafe.Pointer(offset)), uintptr(count), 0, 0) written = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1942,7 +1941,7 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) ( } _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_sendto)), 6, uintptr(s), uintptr(unsafe.Pointer(_p0)), uintptr(len(buf)), uintptr(flags), uintptr(to), 
uintptr(addrlen)) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1953,7 +1952,7 @@ func socket(domain int, typ int, proto int) (fd int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_socket)), 3, uintptr(domain), uintptr(typ), uintptr(proto), 0, 0, 0) fd = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1963,7 +1962,7 @@ func socket(domain int, typ int, proto int) (fd int, err error) { func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&proc__xnet_socketpair)), 4, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1978,7 +1977,7 @@ func write(fd int, p []byte) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procwrite)), 3, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1988,7 +1987,7 @@ func write(fd int, p []byte) (n int, err error) { func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&proc__xnet_getsockopt)), 5, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -1998,7 +1997,7 @@ func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procgetpeername)), 3, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2008,7 +2007,7 @@ func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procsetsockopt)), 5, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2023,7 +2022,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procrecvfrom)), 6, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2034,7 +2033,7 @@ func port_create() (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_create)), 0, 0, 0, 0, 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2045,7 +2044,7 @@ func port_associate(port int, source int, object uintptr, events int, user *byte r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_associate)), 5, uintptr(port), uintptr(source), uintptr(object), uintptr(events), uintptr(unsafe.Pointer(user)), 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2056,7 +2055,7 @@ func port_dissociate(port int, source int, object uintptr) (n int, err error) { r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_dissociate)), 3, uintptr(port), uintptr(source), uintptr(object), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2067,7 +2066,7 @@ func port_get(port int, pe *portEvent, timeout *Timespec) (n int, err error) { r0, _, e1 := 
sysvicall6(uintptr(unsafe.Pointer(&procport_get)), 3, uintptr(port), uintptr(unsafe.Pointer(pe)), uintptr(unsafe.Pointer(timeout)), 0, 0, 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2078,7 +2077,7 @@ func port_getn(port int, pe *portEvent, max uint32, nget *uint32, timeout *Times r0, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procport_getn)), 5, uintptr(port), uintptr(unsafe.Pointer(pe)), uintptr(max), uintptr(unsafe.Pointer(nget)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2088,7 +2087,7 @@ func port_getn(port int, pe *portEvent, max uint32, nget *uint32, timeout *Times func putmsg(fd int, clptr *strbuf, dataptr *strbuf, flags int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procputmsg)), 4, uintptr(fd), uintptr(unsafe.Pointer(clptr)), uintptr(unsafe.Pointer(dataptr)), uintptr(flags), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } @@ -2098,7 +2097,7 @@ func putmsg(fd int, clptr *strbuf, dataptr *strbuf, flags int) (err error) { func getmsg(fd int, clptr *strbuf, dataptr *strbuf, flags *int) (err error) { _, _, e1 := sysvicall6(uintptr(unsafe.Pointer(&procgetmsg)), 4, uintptr(fd), uintptr(unsafe.Pointer(clptr)), uintptr(unsafe.Pointer(dataptr)), uintptr(unsafe.Pointer(flags)), 0, 0) if e1 != 0 { - err = e1 + err = errnoErr(e1) } return } diff --git a/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go b/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go index c31681743..94f011238 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_zos_s390x.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build zos && s390x -// +build zos,s390x package unix @@ -40,17 +39,6 @@ func read(fd int, p []byte) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) - n = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func write(fd int, p []byte) (n int, err error) { var _p0 unsafe.Pointer if len(p) > 0 { diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go index 55e048471..3a58ae819 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build 386 && openbsd -// +build 386,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go index d2243cf83..dcb7a0eb7 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build amd64 && openbsd -// +build amd64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go index 82dc51bd8..db5a7bf13 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. 
//go:build arm && openbsd -// +build arm,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go index cbdda1a4a..7be575a77 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build arm64 && openbsd -// +build arm64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go index f55eae1a8..d6e3174c6 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build mips64 && openbsd -// +build mips64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go index e44054470..ee97157d0 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build ppc64 && openbsd -// +build ppc64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go index a0db82fce..35c3b91d0 100644 --- a/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysctl_openbsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. //go:build riscv64 && openbsd -// +build riscv64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go index f8298ff9b..5edda7687 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_darwin_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && darwin -// +build amd64,darwin package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go index 5eb433bbf..0dc9e8b4d 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_darwin_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && darwin -// +build arm64,darwin package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go index 703675c0c..308ddf3a1 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_dragonfly_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && dragonfly -// +build amd64,dragonfly package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go index 4e0d96107..418664e3d 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build 386 && freebsd -// +build 386,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go index 01636b838..34d0b86d7 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && freebsd -// +build amd64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go index ad99bc106..b71cf45e2 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && freebsd -// +build arm,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go index 89dcc4274..e32df1c1e 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && freebsd -// +build arm64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go index ee37aaa0c..15ad6111f 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && freebsd -// +build riscv64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go index c9c4ad031..0cc3ce496 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && linux -// +build 386,linux package unix @@ -447,4 +446,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go index 12ff3417c..856d92d69 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux -// +build amd64,linux package unix @@ -369,4 +368,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go index c3fb5e77a..8d467094c 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build arm && linux -// +build arm,linux package unix @@ -411,4 +410,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go index 358c847a4..edc173244 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux -// +build arm64,linux package unix @@ -314,4 +313,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go index 81c4849b1..445eba206 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_loong64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build loong64 && linux -// +build loong64,linux package unix @@ -308,4 +307,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go index 202a57e90..adba01bca 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux -// +build mips,linux package unix @@ -431,4 +430,10 @@ const ( SYS_PROCESS_MRELEASE = 4448 SYS_FUTEX_WAITV = 4449 SYS_SET_MEMPOLICY_HOME_NODE = 4450 + SYS_CACHESTAT = 4451 + SYS_FCHMODAT2 = 4452 + SYS_MAP_SHADOW_STACK = 4453 + SYS_FUTEX_WAKE = 4454 + SYS_FUTEX_WAIT = 4455 + SYS_FUTEX_REQUEUE = 4456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go index 1fbceb52d..014c4e9c7 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux -// +build mips64,linux package unix @@ -361,4 +360,10 @@ const ( SYS_PROCESS_MRELEASE = 5448 SYS_FUTEX_WAITV = 5449 SYS_SET_MEMPOLICY_HOME_NODE = 5450 + SYS_CACHESTAT = 5451 + SYS_FCHMODAT2 = 5452 + SYS_MAP_SHADOW_STACK = 5453 + SYS_FUTEX_WAKE = 5454 + SYS_FUTEX_WAIT = 5455 + SYS_FUTEX_REQUEUE = 5456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go index b4ffb7a20..ccc97d74d 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build mips64le && linux -// +build mips64le,linux package unix @@ -361,4 +360,10 @@ const ( SYS_PROCESS_MRELEASE = 5448 SYS_FUTEX_WAITV = 5449 SYS_SET_MEMPOLICY_HOME_NODE = 5450 + SYS_CACHESTAT = 5451 + SYS_FCHMODAT2 = 5452 + SYS_MAP_SHADOW_STACK = 5453 + SYS_FUTEX_WAKE = 5454 + SYS_FUTEX_WAIT = 5455 + SYS_FUTEX_REQUEUE = 5456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go index 867985f9b..ec2b64a95 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mipsle && linux -// +build mipsle,linux package unix @@ -431,4 +430,10 @@ const ( SYS_PROCESS_MRELEASE = 4448 SYS_FUTEX_WAITV = 4449 SYS_SET_MEMPOLICY_HOME_NODE = 4450 + SYS_CACHESTAT = 4451 + SYS_FCHMODAT2 = 4452 + SYS_MAP_SHADOW_STACK = 4453 + SYS_FUTEX_WAKE = 4454 + SYS_FUTEX_WAIT = 4455 + SYS_FUTEX_REQUEUE = 4456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go index a8cce69ed..21a839e33 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux -// +build ppc,linux package unix @@ -438,4 +437,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go index d44c5b39d..c11121ec3 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux -// +build ppc64,linux package unix @@ -410,4 +409,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go index 4214dd9c0..909b631fc 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux -// +build ppc64le,linux package unix @@ -410,4 +409,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go index ef285c567..e49bed16e 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build riscv64 && linux -// +build riscv64,linux package unix @@ -315,4 +314,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go index e6ed7d637..66017d2d3 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux -// +build s390x,linux package unix @@ -376,4 +375,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go index 92f628ef4..47bab18dc 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux -// +build sparc64,linux package unix @@ -389,4 +388,10 @@ const ( SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 + SYS_CACHESTAT = 451 + SYS_FCHMODAT2 = 452 + SYS_MAP_SHADOW_STACK = 453 + SYS_FUTEX_WAKE = 454 + SYS_FUTEX_WAIT = 455 + SYS_FUTEX_REQUEUE = 456 ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go index 3a6699eba..b2aa8cd49 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && netbsd -// +build 386,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go index 5677cd4f1..524a1b1c9 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && netbsd -// +build amd64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go index e784cb6db..d59b943ac 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && netbsd -// +build arm,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go index bd4952efa..31e771d53 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_netbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; DO NOT EDIT. 
//go:build arm64 && netbsd -// +build arm64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go index 597733813..9fd77c6cb 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && openbsd -// +build 386,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go index 16af29189..af10af28c 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && openbsd -// +build amd64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go index f59b18a97..cc2028af4 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && openbsd -// +build arm,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go index 721ef5910..c06dd4415 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && openbsd -// +build arm64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go index 01c43a01f..9ddbf3e08 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && openbsd -// +build mips64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go index f258cfa24..19a6ee413 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && openbsd -// +build ppc64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go index 07919e0ec..05192a782 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && openbsd -// +build riscv64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go index 073daad43..b2e308581 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_zos_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build zos && s390x -// +build zos,s390x package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go index 7a8161c1d..3e6d57cae 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go +++ b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && aix -// +build ppc,aix package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go index 07ed733c5..3a219bdce 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_aix_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && aix -// +build ppc64,aix package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go index 690cefc3d..091d107f3 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && darwin -// +build amd64,darwin package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go index 5bffc10ea..28ff4ef74 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && darwin -// +build arm64,darwin package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go index d0ba8e9b8..30e405bb4 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_dragonfly_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && dragonfly -// +build amd64,dragonfly package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go index 29dc48337..6cbd094a3 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && freebsd -// +build 386,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go index 0a89b2890..7c03b6ee7 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && freebsd -// +build amd64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go index c8666bb15..422107ee8 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build arm && freebsd -// +build arm,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go index 88fb48a88..505a12acf 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && freebsd -// +build arm64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go index 698dc975e..cc986c790 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && freebsd -// +build riscv64,freebsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index 26ef52aaf..eff6bcdef 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -1,7 +1,6 @@ // Code generated by mkmerge; DO NOT EDIT. //go:build linux -// +build linux package unix @@ -175,7 +174,8 @@ type FscryptPolicyV2 struct { Contents_encryption_mode uint8 Filenames_encryption_mode uint8 Flags uint8 - _ [4]uint8 + Log2_data_unit_size uint8 + _ [3]uint8 Master_key_identifier [16]uint8 } @@ -456,60 +456,63 @@ type Ucred struct { } type TCPInfo struct { - State uint8 - Ca_state uint8 - Retransmits uint8 - Probes uint8 - Backoff uint8 - Options uint8 - Rto uint32 - Ato uint32 - Snd_mss uint32 - Rcv_mss uint32 - Unacked uint32 - Sacked uint32 - Lost uint32 - Retrans uint32 - Fackets uint32 - Last_data_sent uint32 - Last_ack_sent uint32 - Last_data_recv uint32 - Last_ack_recv uint32 - Pmtu uint32 - Rcv_ssthresh uint32 - Rtt uint32 - Rttvar uint32 - Snd_ssthresh uint32 - Snd_cwnd uint32 - Advmss uint32 - Reordering uint32 - Rcv_rtt uint32 - Rcv_space uint32 - Total_retrans uint32 - Pacing_rate uint64 - Max_pacing_rate uint64 - Bytes_acked uint64 - Bytes_received uint64 - Segs_out uint32 - Segs_in uint32 - Notsent_bytes uint32 - Min_rtt uint32 - Data_segs_in uint32 - Data_segs_out uint32 - Delivery_rate uint64 - Busy_time uint64 - Rwnd_limited uint64 - Sndbuf_limited uint64 - Delivered uint32 - Delivered_ce uint32 - Bytes_sent uint64 - Bytes_retrans uint64 - Dsack_dups uint32 - Reord_seen uint32 - Rcv_ooopack uint32 - Snd_wnd uint32 - Rcv_wnd uint32 - Rehash uint32 + State uint8 + Ca_state uint8 + Retransmits uint8 + Probes uint8 + Backoff uint8 + Options uint8 + Rto uint32 + Ato uint32 + Snd_mss uint32 + Rcv_mss uint32 + Unacked uint32 + Sacked uint32 + Lost uint32 + Retrans uint32 + Fackets uint32 + Last_data_sent uint32 + Last_ack_sent uint32 + Last_data_recv uint32 + Last_ack_recv uint32 + Pmtu uint32 + Rcv_ssthresh uint32 + Rtt uint32 + Rttvar uint32 + Snd_ssthresh uint32 + Snd_cwnd uint32 + Advmss uint32 + Reordering uint32 + Rcv_rtt uint32 + Rcv_space uint32 + Total_retrans uint32 + Pacing_rate uint64 + Max_pacing_rate uint64 + Bytes_acked uint64 + Bytes_received uint64 + Segs_out uint32 + Segs_in uint32 + Notsent_bytes uint32 + Min_rtt uint32 + Data_segs_in uint32 + Data_segs_out uint32 + Delivery_rate uint64 + Busy_time uint64 + Rwnd_limited uint64 + Sndbuf_limited uint64 + Delivered uint32 + Delivered_ce uint32 + Bytes_sent uint64 + Bytes_retrans uint64 + Dsack_dups uint32 + Reord_seen uint32 + Rcv_ooopack uint32 + 
Snd_wnd uint32 + Rcv_wnd uint32 + Rehash uint32 + Total_rto uint16 + Total_rto_recoveries uint16 + Total_rto_time uint32 } type CanFilter struct { @@ -552,7 +555,7 @@ const ( SizeofIPv6MTUInfo = 0x20 SizeofICMPv6Filter = 0x20 SizeofUcred = 0xc - SizeofTCPInfo = 0xf0 + SizeofTCPInfo = 0xf8 SizeofCanFilter = 0x8 SizeofTCPRepairOpt = 0x8 ) @@ -833,6 +836,15 @@ const ( FSPICK_EMPTY_PATH = 0x8 FSMOUNT_CLOEXEC = 0x1 + + FSCONFIG_SET_FLAG = 0x0 + FSCONFIG_SET_STRING = 0x1 + FSCONFIG_SET_BINARY = 0x2 + FSCONFIG_SET_PATH = 0x3 + FSCONFIG_SET_PATH_EMPTY = 0x4 + FSCONFIG_SET_FD = 0x5 + FSCONFIG_CMD_CREATE = 0x6 + FSCONFIG_CMD_RECONFIGURE = 0x7 ) type OpenHow struct { @@ -1547,6 +1559,7 @@ const ( IFLA_DEVLINK_PORT = 0x3e IFLA_GSO_IPV4_MAX_SIZE = 0x3f IFLA_GRO_IPV4_MAX_SIZE = 0x40 + IFLA_DPLL_PIN = 0x41 IFLA_PROTO_DOWN_REASON_UNSPEC = 0x0 IFLA_PROTO_DOWN_REASON_MASK = 0x1 IFLA_PROTO_DOWN_REASON_VALUE = 0x2 @@ -1562,6 +1575,7 @@ const ( IFLA_INET6_ICMP6STATS = 0x6 IFLA_INET6_TOKEN = 0x7 IFLA_INET6_ADDR_GEN_MODE = 0x8 + IFLA_INET6_RA_MTU = 0x9 IFLA_BR_UNSPEC = 0x0 IFLA_BR_FORWARD_DELAY = 0x1 IFLA_BR_HELLO_TIME = 0x2 @@ -1609,6 +1623,9 @@ const ( IFLA_BR_MCAST_MLD_VERSION = 0x2c IFLA_BR_VLAN_STATS_PER_PORT = 0x2d IFLA_BR_MULTI_BOOLOPT = 0x2e + IFLA_BR_MCAST_QUERIER_STATE = 0x2f + IFLA_BR_FDB_N_LEARNED = 0x30 + IFLA_BR_FDB_MAX_LEARNED = 0x31 IFLA_BRPORT_UNSPEC = 0x0 IFLA_BRPORT_STATE = 0x1 IFLA_BRPORT_PRIORITY = 0x2 @@ -1646,6 +1663,14 @@ const ( IFLA_BRPORT_BACKUP_PORT = 0x22 IFLA_BRPORT_MRP_RING_OPEN = 0x23 IFLA_BRPORT_MRP_IN_OPEN = 0x24 + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT = 0x25 + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT = 0x26 + IFLA_BRPORT_LOCKED = 0x27 + IFLA_BRPORT_MAB = 0x28 + IFLA_BRPORT_MCAST_N_GROUPS = 0x29 + IFLA_BRPORT_MCAST_MAX_GROUPS = 0x2a + IFLA_BRPORT_NEIGH_VLAN_SUPPRESS = 0x2b + IFLA_BRPORT_BACKUP_NHID = 0x2c IFLA_INFO_UNSPEC = 0x0 IFLA_INFO_KIND = 0x1 IFLA_INFO_DATA = 0x2 @@ -1667,6 +1692,9 @@ const ( IFLA_MACVLAN_MACADDR = 0x4 IFLA_MACVLAN_MACADDR_DATA = 0x5 IFLA_MACVLAN_MACADDR_COUNT = 0x6 + IFLA_MACVLAN_BC_QUEUE_LEN = 0x7 + IFLA_MACVLAN_BC_QUEUE_LEN_USED = 0x8 + IFLA_MACVLAN_BC_CUTOFF = 0x9 IFLA_VRF_UNSPEC = 0x0 IFLA_VRF_TABLE = 0x1 IFLA_VRF_PORT_UNSPEC = 0x0 @@ -1690,9 +1718,22 @@ const ( IFLA_XFRM_UNSPEC = 0x0 IFLA_XFRM_LINK = 0x1 IFLA_XFRM_IF_ID = 0x2 + IFLA_XFRM_COLLECT_METADATA = 0x3 IFLA_IPVLAN_UNSPEC = 0x0 IFLA_IPVLAN_MODE = 0x1 IFLA_IPVLAN_FLAGS = 0x2 + NETKIT_NEXT = -0x1 + NETKIT_PASS = 0x0 + NETKIT_DROP = 0x2 + NETKIT_REDIRECT = 0x7 + NETKIT_L2 = 0x0 + NETKIT_L3 = 0x1 + IFLA_NETKIT_UNSPEC = 0x0 + IFLA_NETKIT_PEER_INFO = 0x1 + IFLA_NETKIT_PRIMARY = 0x2 + IFLA_NETKIT_POLICY = 0x3 + IFLA_NETKIT_PEER_POLICY = 0x4 + IFLA_NETKIT_MODE = 0x5 IFLA_VXLAN_UNSPEC = 0x0 IFLA_VXLAN_ID = 0x1 IFLA_VXLAN_GROUP = 0x2 @@ -1723,6 +1764,8 @@ const ( IFLA_VXLAN_GPE = 0x1b IFLA_VXLAN_TTL_INHERIT = 0x1c IFLA_VXLAN_DF = 0x1d + IFLA_VXLAN_VNIFILTER = 0x1e + IFLA_VXLAN_LOCALBYPASS = 0x1f IFLA_GENEVE_UNSPEC = 0x0 IFLA_GENEVE_ID = 0x1 IFLA_GENEVE_REMOTE = 0x2 @@ -1737,6 +1780,7 @@ const ( IFLA_GENEVE_LABEL = 0xb IFLA_GENEVE_TTL_INHERIT = 0xc IFLA_GENEVE_DF = 0xd + IFLA_GENEVE_INNER_PROTO_INHERIT = 0xe IFLA_BAREUDP_UNSPEC = 0x0 IFLA_BAREUDP_PORT = 0x1 IFLA_BAREUDP_ETHERTYPE = 0x2 @@ -1749,6 +1793,8 @@ const ( IFLA_GTP_FD1 = 0x2 IFLA_GTP_PDP_HASHSIZE = 0x3 IFLA_GTP_ROLE = 0x4 + IFLA_GTP_CREATE_SOCKETS = 0x5 + IFLA_GTP_RESTART_COUNT = 0x6 IFLA_BOND_UNSPEC = 0x0 IFLA_BOND_MODE = 0x1 IFLA_BOND_ACTIVE_SLAVE = 0x2 @@ -1778,6 +1824,9 @@ const ( IFLA_BOND_AD_ACTOR_SYSTEM = 0x1a IFLA_BOND_TLB_DYNAMIC_LB = 0x1b 
IFLA_BOND_PEER_NOTIF_DELAY = 0x1c + IFLA_BOND_AD_LACP_ACTIVE = 0x1d + IFLA_BOND_MISSED_MAX = 0x1e + IFLA_BOND_NS_IP6_TARGET = 0x1f IFLA_BOND_AD_INFO_UNSPEC = 0x0 IFLA_BOND_AD_INFO_AGGREGATOR = 0x1 IFLA_BOND_AD_INFO_NUM_PORTS = 0x2 @@ -1793,6 +1842,7 @@ const ( IFLA_BOND_SLAVE_AD_AGGREGATOR_ID = 0x6 IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE = 0x7 IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE = 0x8 + IFLA_BOND_SLAVE_PRIO = 0x9 IFLA_VF_INFO_UNSPEC = 0x0 IFLA_VF_INFO = 0x1 IFLA_VF_UNSPEC = 0x0 @@ -1851,8 +1901,16 @@ const ( IFLA_STATS_LINK_XSTATS_SLAVE = 0x3 IFLA_STATS_LINK_OFFLOAD_XSTATS = 0x4 IFLA_STATS_AF_SPEC = 0x5 + IFLA_STATS_GETSET_UNSPEC = 0x0 + IFLA_STATS_GET_FILTERS = 0x1 + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS = 0x2 IFLA_OFFLOAD_XSTATS_UNSPEC = 0x0 IFLA_OFFLOAD_XSTATS_CPU_HIT = 0x1 + IFLA_OFFLOAD_XSTATS_HW_S_INFO = 0x2 + IFLA_OFFLOAD_XSTATS_L3_STATS = 0x3 + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC = 0x0 + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST = 0x1 + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED = 0x2 IFLA_XDP_UNSPEC = 0x0 IFLA_XDP_FD = 0x1 IFLA_XDP_ATTACHED = 0x2 @@ -1882,6 +1940,11 @@ const ( IFLA_RMNET_UNSPEC = 0x0 IFLA_RMNET_MUX_ID = 0x1 IFLA_RMNET_FLAGS = 0x2 + IFLA_MCTP_UNSPEC = 0x0 + IFLA_MCTP_NET = 0x1 + IFLA_DSA_UNSPEC = 0x0 + IFLA_DSA_CONDUIT = 0x1 + IFLA_DSA_MASTER = 0x1 ) const ( @@ -1977,7 +2040,7 @@ const ( NFT_MSG_GETFLOWTABLE = 0x17 NFT_MSG_DELFLOWTABLE = 0x18 NFT_MSG_GETRULE_RESET = 0x19 - NFT_MSG_MAX = 0x21 + NFT_MSG_MAX = 0x22 NFTA_LIST_UNSPEC = 0x0 NFTA_LIST_ELEM = 0x1 NFTA_HOOK_UNSPEC = 0x0 @@ -2672,6 +2735,7 @@ const ( BPF_PROG_TYPE_LSM = 0x1d BPF_PROG_TYPE_SK_LOOKUP = 0x1e BPF_PROG_TYPE_SYSCALL = 0x1f + BPF_PROG_TYPE_NETFILTER = 0x20 BPF_CGROUP_INET_INGRESS = 0x0 BPF_CGROUP_INET_EGRESS = 0x1 BPF_CGROUP_INET_SOCK_CREATE = 0x2 @@ -2716,6 +2780,11 @@ const ( BPF_PERF_EVENT = 0x29 BPF_TRACE_KPROBE_MULTI = 0x2a BPF_LSM_CGROUP = 0x2b + BPF_STRUCT_OPS = 0x2c + BPF_NETFILTER = 0x2d + BPF_TCX_INGRESS = 0x2e + BPF_TCX_EGRESS = 0x2f + BPF_TRACE_UPROBE_MULTI = 0x30 BPF_LINK_TYPE_UNSPEC = 0x0 BPF_LINK_TYPE_RAW_TRACEPOINT = 0x1 BPF_LINK_TYPE_TRACING = 0x2 @@ -2726,6 +2795,18 @@ const ( BPF_LINK_TYPE_PERF_EVENT = 0x7 BPF_LINK_TYPE_KPROBE_MULTI = 0x8 BPF_LINK_TYPE_STRUCT_OPS = 0x9 + BPF_LINK_TYPE_NETFILTER = 0xa + BPF_LINK_TYPE_TCX = 0xb + BPF_LINK_TYPE_UPROBE_MULTI = 0xc + BPF_PERF_EVENT_UNSPEC = 0x0 + BPF_PERF_EVENT_UPROBE = 0x1 + BPF_PERF_EVENT_URETPROBE = 0x2 + BPF_PERF_EVENT_KPROBE = 0x3 + BPF_PERF_EVENT_KRETPROBE = 0x4 + BPF_PERF_EVENT_TRACEPOINT = 0x5 + BPF_PERF_EVENT_EVENT = 0x6 + BPF_F_KPROBE_MULTI_RETURN = 0x1 + BPF_F_UPROBE_MULTI_RETURN = 0x1 BPF_ANY = 0x0 BPF_NOEXIST = 0x1 BPF_EXIST = 0x2 @@ -2743,6 +2824,8 @@ const ( BPF_F_MMAPABLE = 0x400 BPF_F_PRESERVE_ELEMS = 0x800 BPF_F_INNER_MAP = 0x1000 + BPF_F_LINK = 0x2000 + BPF_F_PATH_FD = 0x4000 BPF_STATS_RUN_TIME = 0x0 BPF_STACK_BUILD_ID_EMPTY = 0x0 BPF_STACK_BUILD_ID_VALID = 0x1 @@ -2763,6 +2846,7 @@ const ( BPF_F_ZERO_CSUM_TX = 0x2 BPF_F_DONT_FRAGMENT = 0x4 BPF_F_SEQ_NUMBER = 0x8 + BPF_F_NO_TUNNEL_KEY = 0x10 BPF_F_TUNINFO_FLAGS = 0x10 BPF_F_INDEX_MASK = 0xffffffff BPF_F_CURRENT_CPU = 0xffffffff @@ -2779,6 +2863,8 @@ const ( BPF_F_ADJ_ROOM_ENCAP_L4_UDP = 0x10 BPF_F_ADJ_ROOM_NO_CSUM_RESET = 0x20 BPF_F_ADJ_ROOM_ENCAP_L2_ETH = 0x40 + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = 0x80 + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = 0x100 BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 0x38 BPF_F_SYSCTL_BASE_NAME = 0x1 @@ -2867,6 +2953,8 @@ const ( BPF_DEVCG_DEV_CHAR = 0x2 BPF_FIB_LOOKUP_DIRECT = 0x1 BPF_FIB_LOOKUP_OUTPUT = 0x2 + 
BPF_FIB_LOOKUP_SKIP_NEIGH = 0x4 + BPF_FIB_LOOKUP_TBID = 0x8 BPF_FIB_LKUP_RET_SUCCESS = 0x0 BPF_FIB_LKUP_RET_BLACKHOLE = 0x1 BPF_FIB_LKUP_RET_UNREACHABLE = 0x2 @@ -2902,6 +2990,7 @@ const ( BPF_CORE_ENUMVAL_EXISTS = 0xa BPF_CORE_ENUMVAL_VALUE = 0xb BPF_CORE_TYPE_MATCHES = 0xc + BPF_F_TIMER_ABS = 0x1 ) const ( @@ -2980,6 +3069,12 @@ type LoopInfo64 struct { Encrypt_key [32]uint8 Init [2]uint64 } +type LoopConfig struct { + Fd uint32 + Size uint32 + Info LoopInfo64 + _ [8]uint64 +} type TIPCSocketAddr struct { Ref uint32 @@ -3368,7 +3463,7 @@ const ( DEVLINK_PORT_FN_ATTR_STATE = 0x2 DEVLINK_PORT_FN_ATTR_OPSTATE = 0x3 DEVLINK_PORT_FN_ATTR_CAPS = 0x4 - DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x4 + DEVLINK_PORT_FUNCTION_ATTR_MAX = 0x5 ) type FsverityDigest struct { @@ -4152,7 +4247,8 @@ const ( ) type LandlockRulesetAttr struct { - Access_fs uint64 + Access_fs uint64 + Access_net uint64 } type LandlockPathBeneathAttr struct { @@ -4499,7 +4595,7 @@ const ( NL80211_ATTR_MAC_HINT = 0xc8 NL80211_ATTR_MAC_MASK = 0xd7 NL80211_ATTR_MAX_AP_ASSOC_STA = 0xca - NL80211_ATTR_MAX = 0x145 + NL80211_ATTR_MAX = 0x146 NL80211_ATTR_MAX_CRIT_PROT_DURATION = 0xb4 NL80211_ATTR_MAX_CSA_COUNTERS = 0xce NL80211_ATTR_MAX_MATCH_SETS = 0x85 @@ -4869,7 +4965,7 @@ const ( NL80211_CMD_LEAVE_IBSS = 0x2c NL80211_CMD_LEAVE_MESH = 0x45 NL80211_CMD_LEAVE_OCB = 0x6d - NL80211_CMD_MAX = 0x99 + NL80211_CMD_MAX = 0x9a NL80211_CMD_MICHAEL_MIC_FAILURE = 0x29 NL80211_CMD_MODIFY_LINK_STA = 0x97 NL80211_CMD_NAN_MATCH = 0x78 @@ -5103,7 +5199,7 @@ const ( NL80211_FREQUENCY_ATTR_GO_CONCURRENT = 0xf NL80211_FREQUENCY_ATTR_INDOOR_ONLY = 0xe NL80211_FREQUENCY_ATTR_IR_CONCURRENT = 0xf - NL80211_FREQUENCY_ATTR_MAX = 0x1b + NL80211_FREQUENCY_ATTR_MAX = 0x1c NL80211_FREQUENCY_ATTR_MAX_TX_POWER = 0x6 NL80211_FREQUENCY_ATTR_NO_10MHZ = 0x11 NL80211_FREQUENCY_ATTR_NO_160MHZ = 0xc @@ -5503,7 +5599,7 @@ const ( NL80211_RATE_INFO_HE_RU_ALLOC_52 = 0x1 NL80211_RATE_INFO_HE_RU_ALLOC_996 = 0x5 NL80211_RATE_INFO_HE_RU_ALLOC = 0x11 - NL80211_RATE_INFO_MAX = 0x16 + NL80211_RATE_INFO_MAX = 0x1d NL80211_RATE_INFO_MCS = 0x2 NL80211_RATE_INFO_SHORT_GI = 0x4 NL80211_RATE_INFO_VHT_MCS = 0x6 @@ -5516,7 +5612,7 @@ const ( NL80211_REGDOM_TYPE_CUSTOM_WORLD = 0x2 NL80211_REGDOM_TYPE_INTERSECTION = 0x3 NL80211_REGDOM_TYPE_WORLD = 0x1 - NL80211_REG_RULE_ATTR_MAX = 0x7 + NL80211_REG_RULE_ATTR_MAX = 0x8 NL80211_REKEY_DATA_AKM = 0x4 NL80211_REKEY_DATA_KCK = 0x2 NL80211_REKEY_DATA_KEK = 0x1 @@ -5868,3 +5964,30 @@ const ( VIRTIO_NET_HDR_GSO_UDP_L4 = 0x5 VIRTIO_NET_HDR_GSO_ECN = 0x80 ) + +type SchedAttr struct { + Size uint32 + Policy uint32 + Flags uint64 + Nice int32 + Priority uint32 + Runtime uint64 + Deadline uint64 + Period uint64 + Util_min uint32 + Util_max uint32 +} + +const SizeofSchedAttr = 0x38 + +type Cachestat_t struct { + Cache uint64 + Dirty uint64 + Writeback uint64 + Evicted uint64 + Recently_evicted uint64 +} +type CachestatRange struct { + Off uint64 + Len uint64 +} diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go index 6d8acbcc5..438a30aff 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build 386 && linux -// +build 386,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go index 59293c688..adceca355 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux -// +build amd64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go index 40cfa38c2..eeaa00a37 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && linux -// +build arm,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go index 055bc4216..6739aa91d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux -// +build arm64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go index f28affbc6..9920ef631 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_loong64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build loong64 && linux -// +build loong64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go index 9d71e7ccd..2923b799a 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux -// +build mips,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go index fd5ccd332..ce2750ee4 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux -// +build mips64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go index 7704de77a..3038811d7 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64le && linux -// +build mips64le,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go index df00b8757..efc6fed18 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build mipsle && linux -// +build mipsle,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go index 0942840db..9a654b75a 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux -// +build ppc,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go index 034874395..40d358e33 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux -// +build ppc64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go index bad067047..148c6ceb8 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux -// +build ppc64le,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 83c69c119..72ba81543 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && linux -// +build riscv64,linux package unix @@ -733,6 +732,10 @@ const ( RISCV_HWPROBE_KEY_IMA_EXT_0 = 0x4 RISCV_HWPROBE_IMA_FD = 0x1 RISCV_HWPROBE_IMA_C = 0x2 + RISCV_HWPROBE_IMA_V = 0x4 + RISCV_HWPROBE_EXT_ZBA = 0x8 + RISCV_HWPROBE_EXT_ZBB = 0x10 + RISCV_HWPROBE_EXT_ZBS = 0x20 RISCV_HWPROBE_KEY_CPUPERF_0 = 0x5 RISCV_HWPROBE_MISALIGNED_UNKNOWN = 0x0 RISCV_HWPROBE_MISALIGNED_EMULATED = 0x1 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go index aa268d025..71e765508 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux -// +build s390x,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go index 444045b6c..4abbdb9de 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux -// +build sparc64,linux package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go index 9bc4c8f9d..f22e7947d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build 386 && netbsd -// +build 386,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go index bb05f655d..066a7d83d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && netbsd -// +build amd64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go index db40e3a19..439548ec9 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && netbsd -// +build arm,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go index 11121151c..16085d3bb 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_netbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && netbsd -// +build arm64,netbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go index 26eba23b7..afd13a3af 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && openbsd -// +build 386,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go index 5a5479886..5d97f1f9b 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && openbsd -// +build amd64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go index be58c4e1f..34871cdc1 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && openbsd -// +build arm,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go index 52338266c..5911bceb3 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && openbsd -// +build arm64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go index 605cfdb12..e4f24f3bc 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. 
//go:build mips64 && openbsd -// +build mips64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go index d6724c010..ca50a7930 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_ppc64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && openbsd -// +build ppc64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go index ddfd27a43..d7d7f7902 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_riscv64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && openbsd -// +build riscv64,openbsd package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go index 0400747c6..14160576d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go @@ -2,7 +2,6 @@ // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && solaris -// +build amd64,solaris package unix diff --git a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go index aec1efcb3..54f31be63 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/ztypes_zos_s390x.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build zos && s390x -// +build zos,s390x // Hand edited based on ztypes_linux_s390x.go // TODO: auto-generate. diff --git a/vendor/golang.org/x/sys/windows/aliases.go b/vendor/golang.org/x/sys/windows/aliases.go index a20ebea63..ce2d713d6 100644 --- a/vendor/golang.org/x/sys/windows/aliases.go +++ b/vendor/golang.org/x/sys/windows/aliases.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows && go1.9 -// +build windows,go1.9 package windows diff --git a/vendor/golang.org/x/sys/windows/empty.s b/vendor/golang.org/x/sys/windows/empty.s index fdbbbcd31..ba64caca5 100644 --- a/vendor/golang.org/x/sys/windows/empty.s +++ b/vendor/golang.org/x/sys/windows/empty.s @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build !go1.12 -// +build !go1.12 // This file is here to allow bodyless functions with go:linkname for Go 1.11 // and earlier (see https://golang.org/issue/23311). 
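The Linux half of this x/sys bump is mostly mechanical (dropping the legacy `// +build` lines from generated files), but it also vendors the cachestat(2) plumbing: SYS_CACHESTAT (451) in the per-arch zsysnum files and the Cachestat_t / CachestatRange types in ztypes_linux.go. A minimal sketch of exercising that surface through a raw syscall follows; it assumes the raw-syscall path (a higher-level unix.Cachestat wrapper may exist in this version but is not part of the hunks shown here), the file path is purely illustrative, and the call requires Linux 6.1 or newer (older kernels return ENOSYS).

//go:build linux

package main

import (
	"fmt"
	"os"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.Open("/etc/hostname") // illustrative target file
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	defer f.Close()

	// Off = 0 and Len = 0 ask the kernel to report on the whole file.
	var crange unix.CachestatRange
	var cstat unix.Cachestat_t

	_, _, errno := unix.Syscall6(unix.SYS_CACHESTAT,
		f.Fd(),
		uintptr(unsafe.Pointer(&crange)),
		uintptr(unsafe.Pointer(&cstat)),
		0, // flags: currently must be zero
		0, 0)
	if errno != 0 {
		fmt.Fprintln(os.Stderr, "cachestat:", errno) // ENOSYS before Linux 6.1
		return
	}
	fmt.Printf("cached=%d dirty=%d writeback=%d evicted=%d\n",
		cstat.Cache, cstat.Dirty, cstat.Writeback, cstat.Evicted)
}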
diff --git a/vendor/golang.org/x/sys/windows/env_windows.go b/vendor/golang.org/x/sys/windows/env_windows.go index b8ad19250..d4577a423 100644 --- a/vendor/golang.org/x/sys/windows/env_windows.go +++ b/vendor/golang.org/x/sys/windows/env_windows.go @@ -37,14 +37,17 @@ func (token Token) Environ(inheritExisting bool) (env []string, err error) { return nil, err } defer DestroyEnvironmentBlock(block) - blockp := unsafe.Pointer(block) - for { - entry := UTF16PtrToString((*uint16)(blockp)) - if len(entry) == 0 { - break + size := unsafe.Sizeof(*block) + for *block != 0 { + // find NUL terminator + end := unsafe.Pointer(block) + for *(*uint16)(end) != 0 { + end = unsafe.Add(end, size) } - env = append(env, entry) - blockp = unsafe.Add(blockp, 2*(len(entry)+1)) + + entry := unsafe.Slice(block, (uintptr(end)-uintptr(unsafe.Pointer(block)))/size) + env = append(env, UTF16ToString(entry)) + block = (*uint16)(unsafe.Add(end, size)) } return env, nil } diff --git a/vendor/golang.org/x/sys/windows/eventlog.go b/vendor/golang.org/x/sys/windows/eventlog.go index 2cd60645e..6c366955d 100644 --- a/vendor/golang.org/x/sys/windows/eventlog.go +++ b/vendor/golang.org/x/sys/windows/eventlog.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows -// +build windows package windows diff --git a/vendor/golang.org/x/sys/windows/exec_windows.go b/vendor/golang.org/x/sys/windows/exec_windows.go index a52e0331d..9cabbb694 100644 --- a/vendor/golang.org/x/sys/windows/exec_windows.go +++ b/vendor/golang.org/x/sys/windows/exec_windows.go @@ -22,7 +22,7 @@ import ( // but only if there is space or tab inside s. func EscapeArg(s string) string { if len(s) == 0 { - return "\"\"" + return `""` } n := len(s) hasSpace := false @@ -35,7 +35,7 @@ func EscapeArg(s string) string { } } if hasSpace { - n += 2 + n += 2 // Reserve space for quotes. } if n == len(s) { return s @@ -82,20 +82,68 @@ func EscapeArg(s string) string { // in CreateProcess's CommandLine argument, CreateService/ChangeServiceConfig's BinaryPathName argument, // or any program that uses CommandLineToArgv. func ComposeCommandLine(args []string) string { - var commandLine string - for i := range args { - if i > 0 { - commandLine += " " - } - commandLine += EscapeArg(args[i]) + if len(args) == 0 { + return "" } - return commandLine + + // Per https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw: + // “This function accepts command lines that contain a program name; the + // program name can be enclosed in quotation marks or not.” + // + // Unfortunately, it provides no means of escaping interior quotation marks + // within that program name, and we have no way to report them here. + prog := args[0] + mustQuote := len(prog) == 0 + for i := 0; i < len(prog); i++ { + c := prog[i] + if c <= ' ' || (c == '"' && i == 0) { + // Force quotes for not only the ASCII space and tab as described in the + // MSDN article, but also ASCII control characters. + // The documentation for CommandLineToArgvW doesn't say what happens when + // the first argument is not a valid program name, but it empirically + // seems to drop unquoted control characters. + mustQuote = true + break + } + } + var commandLine []byte + if mustQuote { + commandLine = make([]byte, 0, len(prog)+2) + commandLine = append(commandLine, '"') + for i := 0; i < len(prog); i++ { + c := prog[i] + if c == '"' { + // This quote would interfere with our surrounding quotes. 
+ // We have no way to report an error, so just strip out + // the offending character instead. + continue + } + commandLine = append(commandLine, c) + } + commandLine = append(commandLine, '"') + } else { + if len(args) == 1 { + // args[0] is a valid command line representing itself. + // No need to allocate a new slice or string for it. + return prog + } + commandLine = []byte(prog) + } + + for _, arg := range args[1:] { + commandLine = append(commandLine, ' ') + // TODO(bcmills): since we're already appending to a slice, it would be nice + // to avoid the intermediate allocations of EscapeArg. + // Perhaps we can factor out an appendEscapedArg function. + commandLine = append(commandLine, EscapeArg(arg)...) + } + return string(commandLine) } // DecomposeCommandLine breaks apart its argument command line into unescaped parts using CommandLineToArgv, // as gathered from GetCommandLine, QUERY_SERVICE_CONFIG's BinaryPathName argument, or elsewhere that // command lines are passed around. -// DecomposeCommandLine returns error if commandLine contains NUL. +// DecomposeCommandLine returns an error if commandLine contains NUL. func DecomposeCommandLine(commandLine string) ([]string, error) { if len(commandLine) == 0 { return []string{}, nil @@ -105,18 +153,35 @@ func DecomposeCommandLine(commandLine string) ([]string, error) { return nil, errorspkg.New("string with NUL passed to DecomposeCommandLine") } var argc int32 - argv, err := CommandLineToArgv(&utf16CommandLine[0], &argc) + argv, err := commandLineToArgv(&utf16CommandLine[0], &argc) if err != nil { return nil, err } defer LocalFree(Handle(unsafe.Pointer(argv))) + var args []string - for _, v := range (*argv)[:argc] { - args = append(args, UTF16ToString((*v)[:])) + for _, p := range unsafe.Slice(argv, argc) { + args = append(args, UTF16PtrToString(p)) } return args, nil } +// CommandLineToArgv parses a Unicode command line string and sets +// argc to the number of parsed arguments. +// +// The returned memory should be freed using a single call to LocalFree. +// +// Note that although the return type of CommandLineToArgv indicates 8192 +// entries of up to 8192 characters each, the actual count of parsed arguments +// may exceed 8192, and the documentation for CommandLineToArgvW does not mention +// any bound on the lengths of the individual argument strings. +// (See https://go.dev/issue/63236.) +func CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) { + argp, err := commandLineToArgv(cmd, argc) + argv = (*[8192]*[8192]uint16)(unsafe.Pointer(argp)) + return argv, err +} + func CloseOnExec(fd Handle) { SetHandleInformation(Handle(fd), HANDLE_FLAG_INHERIT, 0) } diff --git a/vendor/golang.org/x/sys/windows/mksyscall.go b/vendor/golang.org/x/sys/windows/mksyscall.go index 8563f79c5..dbcdb090c 100644 --- a/vendor/golang.org/x/sys/windows/mksyscall.go +++ b/vendor/golang.org/x/sys/windows/mksyscall.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build generate -// +build generate package windows diff --git a/vendor/golang.org/x/sys/windows/race.go b/vendor/golang.org/x/sys/windows/race.go index 9196b089c..0f1bdc386 100644 --- a/vendor/golang.org/x/sys/windows/race.go +++ b/vendor/golang.org/x/sys/windows/race.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. 
//go:build windows && race -// +build windows,race package windows diff --git a/vendor/golang.org/x/sys/windows/race0.go b/vendor/golang.org/x/sys/windows/race0.go index 7bae4817a..0c78da78b 100644 --- a/vendor/golang.org/x/sys/windows/race0.go +++ b/vendor/golang.org/x/sys/windows/race0.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows && !race -// +build windows,!race package windows diff --git a/vendor/golang.org/x/sys/windows/security_windows.go b/vendor/golang.org/x/sys/windows/security_windows.go index d414ef13b..26be94a8a 100644 --- a/vendor/golang.org/x/sys/windows/security_windows.go +++ b/vendor/golang.org/x/sys/windows/security_windows.go @@ -7,8 +7,6 @@ package windows import ( "syscall" "unsafe" - - "golang.org/x/sys/internal/unsafeheader" ) const ( @@ -1341,21 +1339,14 @@ func (selfRelativeSD *SECURITY_DESCRIPTOR) copySelfRelativeSecurityDescriptor() sdLen = min } - var src []byte - h := (*unsafeheader.Slice)(unsafe.Pointer(&src)) - h.Data = unsafe.Pointer(selfRelativeSD) - h.Len = sdLen - h.Cap = sdLen - + src := unsafe.Slice((*byte)(unsafe.Pointer(selfRelativeSD)), sdLen) + // SECURITY_DESCRIPTOR has pointers in it, which means checkptr expects for it to + // be aligned properly. When we're copying a Windows-allocated struct to a + // Go-allocated one, make sure that the Go allocation is aligned to the + // pointer size. const psize = int(unsafe.Sizeof(uintptr(0))) - - var dst []byte - h = (*unsafeheader.Slice)(unsafe.Pointer(&dst)) alloc := make([]uintptr, (sdLen+psize-1)/psize) - h.Data = (*unsafeheader.Slice)(unsafe.Pointer(&alloc)).Data - h.Len = sdLen - h.Cap = sdLen - + dst := unsafe.Slice((*byte)(unsafe.Pointer(&alloc[0])), sdLen) copy(dst, src) return (*SECURITY_DESCRIPTOR)(unsafe.Pointer(&dst[0])) } diff --git a/vendor/golang.org/x/sys/windows/service.go b/vendor/golang.org/x/sys/windows/service.go index c44a1b963..a9dc6308d 100644 --- a/vendor/golang.org/x/sys/windows/service.go +++ b/vendor/golang.org/x/sys/windows/service.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows -// +build windows package windows diff --git a/vendor/golang.org/x/sys/windows/str.go b/vendor/golang.org/x/sys/windows/str.go index 4fc01434e..6a4f9ce6a 100644 --- a/vendor/golang.org/x/sys/windows/str.go +++ b/vendor/golang.org/x/sys/windows/str.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows -// +build windows package windows diff --git a/vendor/golang.org/x/sys/windows/syscall.go b/vendor/golang.org/x/sys/windows/syscall.go index 8732cdb95..e85ed6b9c 100644 --- a/vendor/golang.org/x/sys/windows/syscall.go +++ b/vendor/golang.org/x/sys/windows/syscall.go @@ -3,7 +3,6 @@ // license that can be found in the LICENSE file. //go:build windows -// +build windows // Package windows contains an interface to the low-level operating system // primitives. 
OS details vary depending on the underlying system, and diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go index 373d16388..6395a031d 100644 --- a/vendor/golang.org/x/sys/windows/syscall_windows.go +++ b/vendor/golang.org/x/sys/windows/syscall_windows.go @@ -15,8 +15,6 @@ import ( "time" "unicode/utf16" "unsafe" - - "golang.org/x/sys/internal/unsafeheader" ) type Handle uintptr @@ -127,8 +125,7 @@ func UTF16PtrToString(p *uint16) string { for ptr := unsafe.Pointer(p); *(*uint16)(ptr) != 0; n++ { ptr = unsafe.Pointer(uintptr(ptr) + unsafe.Sizeof(*p)) } - - return string(utf16.Decode(unsafe.Slice(p, n))) + return UTF16ToString(unsafe.Slice(p, n)) } func Getpagesize() int { return 4096 } @@ -157,6 +154,8 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys GetModuleFileName(module Handle, filename *uint16, size uint32) (n uint32, err error) = kernel32.GetModuleFileNameW //sys GetModuleHandleEx(flags uint32, moduleName *uint16, module *Handle) (err error) = kernel32.GetModuleHandleExW //sys SetDefaultDllDirectories(directoryFlags uint32) (err error) +//sys AddDllDirectory(path *uint16) (cookie uintptr, err error) = kernel32.AddDllDirectory +//sys RemoveDllDirectory(cookie uintptr) (err error) = kernel32.RemoveDllDirectory //sys SetDllDirectory(path string) (err error) = kernel32.SetDllDirectoryW //sys GetVersion() (ver uint32, err error) //sys FormatMessage(flags uint32, msgsrc uintptr, msgid uint32, langid uint32, buf []uint16, args *byte) (n uint32, err error) = FormatMessageW @@ -194,6 +193,7 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys GetComputerName(buf *uint16, n *uint32) (err error) = GetComputerNameW //sys GetComputerNameEx(nametype uint32, buf *uint16, n *uint32) (err error) = GetComputerNameExW //sys SetEndOfFile(handle Handle) (err error) +//sys SetFileValidData(handle Handle, validDataLength int64) (err error) //sys GetSystemTimeAsFileTime(time *Filetime) //sys GetSystemTimePreciseAsFileTime(time *Filetime) //sys GetTimeZoneInformation(tzi *Timezoneinformation) (rc uint32, err error) [failretval==0xffffffff] @@ -216,7 +216,7 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys shGetKnownFolderPath(id *KNOWNFOLDERID, flags uint32, token Token, path **uint16) (ret error) = shell32.SHGetKnownFolderPath //sys TerminateProcess(handle Handle, exitcode uint32) (err error) //sys GetExitCodeProcess(handle Handle, exitcode *uint32) (err error) -//sys GetStartupInfo(startupInfo *StartupInfo) (err error) = GetStartupInfoW +//sys getStartupInfo(startupInfo *StartupInfo) = GetStartupInfoW //sys GetProcessTimes(handle Handle, creationTime *Filetime, exitTime *Filetime, kernelTime *Filetime, userTime *Filetime) (err error) //sys DuplicateHandle(hSourceProcessHandle Handle, hSourceHandle Handle, hTargetProcessHandle Handle, lpTargetHandle *Handle, dwDesiredAccess uint32, bInheritHandle bool, dwOptions uint32) (err error) //sys WaitForSingleObject(handle Handle, waitMilliseconds uint32) (event uint32, err error) [failretval==0xffffffff] @@ -235,12 +235,13 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys CreateEnvironmentBlock(block **uint16, token Token, inheritExisting bool) (err error) = userenv.CreateEnvironmentBlock //sys DestroyEnvironmentBlock(block *uint16) (err error) = userenv.DestroyEnvironmentBlock //sys getTickCount64() (ms uint64) = kernel32.GetTickCount64 +//sys GetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) //sys SetFileTime(handle Handle, ctime 
*Filetime, atime *Filetime, wtime *Filetime) (err error) //sys GetFileAttributes(name *uint16) (attrs uint32, err error) [failretval==INVALID_FILE_ATTRIBUTES] = kernel32.GetFileAttributesW //sys SetFileAttributes(name *uint16, attrs uint32) (err error) = kernel32.SetFileAttributesW //sys GetFileAttributesEx(name *uint16, level uint32, info *byte) (err error) = kernel32.GetFileAttributesExW //sys GetCommandLine() (cmd *uint16) = kernel32.GetCommandLineW -//sys CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) [failretval==nil] = shell32.CommandLineToArgvW +//sys commandLineToArgv(cmd *uint16, argc *int32) (argv **uint16, err error) [failretval==nil] = shell32.CommandLineToArgvW //sys LocalFree(hmem Handle) (handle Handle, err error) [failretval!=0] //sys LocalAlloc(flags uint32, length uint32) (ptr uintptr, err error) //sys SetHandleInformation(handle Handle, mask uint32, flags uint32) (err error) @@ -299,12 +300,15 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys RegNotifyChangeKeyValue(key Handle, watchSubtree bool, notifyFilter uint32, event Handle, asynchronous bool) (regerrno error) = advapi32.RegNotifyChangeKeyValue //sys GetCurrentProcessId() (pid uint32) = kernel32.GetCurrentProcessId //sys ProcessIdToSessionId(pid uint32, sessionid *uint32) (err error) = kernel32.ProcessIdToSessionId +//sys ClosePseudoConsole(console Handle) = kernel32.ClosePseudoConsole +//sys createPseudoConsole(size uint32, in Handle, out Handle, flags uint32, pconsole *Handle) (hr error) = kernel32.CreatePseudoConsole //sys GetConsoleMode(console Handle, mode *uint32) (err error) = kernel32.GetConsoleMode //sys SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode //sys GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo //sys setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition //sys WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW //sys ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW +//sys resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole //sys CreateToolhelp32Snapshot(flags uint32, processId uint32) (handle Handle, err error) [failretval==InvalidHandle] = kernel32.CreateToolhelp32Snapshot //sys Module32First(snapshot Handle, moduleEntry *ModuleEntry32) (err error) = kernel32.Module32FirstW //sys Module32Next(snapshot Handle, moduleEntry *ModuleEntry32) (err error) = kernel32.Module32NextW @@ -437,6 +441,10 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys DwmGetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, size uint32) (ret error) = dwmapi.DwmGetWindowAttribute //sys DwmSetWindowAttribute(hwnd HWND, attribute uint32, value unsafe.Pointer, size uint32) (ret error) = dwmapi.DwmSetWindowAttribute +// Windows Multimedia API +//sys TimeBeginPeriod (period uint32) (err error) [failretval != 0] = winmm.timeBeginPeriod +//sys TimeEndPeriod (period uint32) (err error) [failretval != 0] = winmm.timeEndPeriod + // syscall interface implementation for other packages // GetCurrentProcess returns the handle for the current process. 
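Alongside the pseudo console declarations, the //sys block above gains stubs for AddDllDirectory/RemoveDllDirectory, GetFileTime, SetFileValidData and the winmm pair TimeBeginPeriod/TimeEndPeriod. A hedged usage sketch for two of those pairs follows; the plugin directory path and the 1 ms period are illustrative only, and AddDllDirectory typically only influences searches performed through LoadLibraryEx with the LOAD_LIBRARY_SEARCH_* flags.

//go:build windows

package main

import (
	"log"

	"golang.org/x/sys/windows"
)

func main() {
	// Ask for 1 ms timer resolution and restore the previous setting on exit.
	if err := windows.TimeBeginPeriod(1); err != nil {
		log.Printf("timeBeginPeriod: %v", err)
	} else {
		defer windows.TimeEndPeriod(1)
	}

	// Register an extra DLL search directory, then drop it again.
	dir, err := windows.UTF16PtrFromString(`C:\hypothetical\plugins`) // hypothetical path
	if err != nil {
		log.Fatal(err)
	}
	cookie, err := windows.AddDllDirectory(dir)
	if err != nil {
		log.Fatalf("AddDllDirectory: %v", err)
	}
	defer windows.RemoveDllDirectory(cookie)

	// A subsequent LoadLibraryEx with LOAD_LIBRARY_SEARCH_USER_DIRS would
	// consider the directory registered above.
}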
@@ -964,7 +972,8 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, int32, error) { if n > 0 { sl += int32(n) + 1 } - if sa.raw.Path[0] == '@' { + if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) { + // Check sl > 3 so we don't change unnamed socket behavior. sa.raw.Path[0] = 0 // Don't count trailing NUL for abstract address. sl-- @@ -1624,6 +1633,11 @@ func SetConsoleCursorPosition(console Handle, position Coord) error { return setConsoleCursorPosition(console, *((*uint32)(unsafe.Pointer(&position)))) } +func GetStartupInfo(startupInfo *StartupInfo) error { + getStartupInfo(startupInfo) + return nil +} + func (s NTStatus) Errno() syscall.Errno { return rtlNtStatusToDosErrorNoTeb(s) } @@ -1658,12 +1672,8 @@ func NewNTUnicodeString(s string) (*NTUnicodeString, error) { // Slice returns a uint16 slice that aliases the data in the NTUnicodeString. func (s *NTUnicodeString) Slice() []uint16 { - var slice []uint16 - hdr := (*unsafeheader.Slice)(unsafe.Pointer(&slice)) - hdr.Data = unsafe.Pointer(s.Buffer) - hdr.Len = int(s.Length) - hdr.Cap = int(s.MaximumLength) - return slice + slice := unsafe.Slice(s.Buffer, s.MaximumLength) + return slice[:s.Length] } func (s *NTUnicodeString) String() string { @@ -1686,12 +1696,8 @@ func NewNTString(s string) (*NTString, error) { // Slice returns a byte slice that aliases the data in the NTString. func (s *NTString) Slice() []byte { - var slice []byte - hdr := (*unsafeheader.Slice)(unsafe.Pointer(&slice)) - hdr.Data = unsafe.Pointer(s.Buffer) - hdr.Len = int(s.Length) - hdr.Cap = int(s.MaximumLength) - return slice + slice := unsafe.Slice(s.Buffer, s.MaximumLength) + return slice[:s.Length] } func (s *NTString) String() string { @@ -1743,10 +1749,7 @@ func LoadResourceData(module, resInfo Handle) (data []byte, err error) { if err != nil { return } - h := (*unsafeheader.Slice)(unsafe.Pointer(&data)) - h.Data = unsafe.Pointer(ptr) - h.Len = int(size) - h.Cap = int(size) + data = unsafe.Slice((*byte)(unsafe.Pointer(ptr)), size) return } @@ -1817,3 +1820,17 @@ type PSAPI_WORKING_SET_EX_INFORMATION struct { // A PSAPI_WORKING_SET_EX_BLOCK union that indicates the attributes of the page at VirtualAddress. VirtualAttributes PSAPI_WORKING_SET_EX_BLOCK } + +// CreatePseudoConsole creates a windows pseudo console. +func CreatePseudoConsole(size Coord, in Handle, out Handle, flags uint32, pconsole *Handle) error { + // We need this wrapper to manually cast Coord to uint32. The autogenerated wrappers only + // accept arguments that can be casted to uintptr, and Coord can't. + return createPseudoConsole(*((*uint32)(unsafe.Pointer(&size))), in, out, flags, pconsole) +} + +// ResizePseudoConsole resizes the internal buffers of the pseudo console to the width and height specified in `size`. +func ResizePseudoConsole(pconsole Handle, size Coord) error { + // We need this wrapper to manually cast Coord to uint32. The autogenerated wrappers only + // accept arguments that can be casted to uintptr, and Coord can't. 
+ return resizePseudoConsole(pconsole, *((*uint32)(unsafe.Pointer(&size)))) +} diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go index 88e62a638..359780f6a 100644 --- a/vendor/golang.org/x/sys/windows/types_windows.go +++ b/vendor/golang.org/x/sys/windows/types_windows.go @@ -247,6 +247,7 @@ const ( PROC_THREAD_ATTRIBUTE_MITIGATION_POLICY = 0x00020007 PROC_THREAD_ATTRIBUTE_UMS_THREAD = 0x00030006 PROC_THREAD_ATTRIBUTE_PROTECTION_LEVEL = 0x0002000b + PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE = 0x00020016 ) const ( @@ -1093,7 +1094,33 @@ const ( SOMAXCONN = 0x7fffffff - TCP_NODELAY = 1 + TCP_NODELAY = 1 + TCP_EXPEDITED_1122 = 2 + TCP_KEEPALIVE = 3 + TCP_MAXSEG = 4 + TCP_MAXRT = 5 + TCP_STDURG = 6 + TCP_NOURG = 7 + TCP_ATMARK = 8 + TCP_NOSYNRETRIES = 9 + TCP_TIMESTAMPS = 10 + TCP_OFFLOAD_PREFERENCE = 11 + TCP_CONGESTION_ALGORITHM = 12 + TCP_DELAY_FIN_ACK = 13 + TCP_MAXRTMS = 14 + TCP_FASTOPEN = 15 + TCP_KEEPCNT = 16 + TCP_KEEPIDLE = TCP_KEEPALIVE + TCP_KEEPINTVL = 17 + TCP_FAIL_CONNECT_ON_ICMP_ERROR = 18 + TCP_ICMP_ERROR_INFO = 19 + + UDP_NOCHECKSUM = 1 + UDP_SEND_MSG_SIZE = 2 + UDP_RECV_MAX_COALESCED_SIZE = 3 + UDP_CHECKSUM_COVERAGE = 20 + + UDP_COALESCED_INFO = 3 SHUT_RD = 0 SHUT_WR = 1 @@ -2139,6 +2166,12 @@ const ( ENABLE_LVB_GRID_WORLDWIDE = 0x10 ) +// Pseudo console related constants used for the flags parameter to +// CreatePseudoConsole. See: https://learn.microsoft.com/en-us/windows/console/createpseudoconsole +const ( + PSEUDOCONSOLE_INHERIT_CURSOR = 0x1 +) + type Coord struct { X int16 Y int16 diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go index 566dd3e31..e8791c82c 100644 --- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go +++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go @@ -55,6 +55,7 @@ var ( moduser32 = NewLazySystemDLL("user32.dll") moduserenv = NewLazySystemDLL("userenv.dll") modversion = NewLazySystemDLL("version.dll") + modwinmm = NewLazySystemDLL("winmm.dll") modwintrust = NewLazySystemDLL("wintrust.dll") modws2_32 = NewLazySystemDLL("ws2_32.dll") modwtsapi32 = NewLazySystemDLL("wtsapi32.dll") @@ -183,10 +184,12 @@ var ( procGetAdaptersInfo = modiphlpapi.NewProc("GetAdaptersInfo") procGetBestInterfaceEx = modiphlpapi.NewProc("GetBestInterfaceEx") procGetIfEntry = modiphlpapi.NewProc("GetIfEntry") + procAddDllDirectory = modkernel32.NewProc("AddDllDirectory") procAssignProcessToJobObject = modkernel32.NewProc("AssignProcessToJobObject") procCancelIo = modkernel32.NewProc("CancelIo") procCancelIoEx = modkernel32.NewProc("CancelIoEx") procCloseHandle = modkernel32.NewProc("CloseHandle") + procClosePseudoConsole = modkernel32.NewProc("ClosePseudoConsole") procConnectNamedPipe = modkernel32.NewProc("ConnectNamedPipe") procCreateDirectoryW = modkernel32.NewProc("CreateDirectoryW") procCreateEventExW = modkernel32.NewProc("CreateEventExW") @@ -201,6 +204,7 @@ var ( procCreateNamedPipeW = modkernel32.NewProc("CreateNamedPipeW") procCreatePipe = modkernel32.NewProc("CreatePipe") procCreateProcessW = modkernel32.NewProc("CreateProcessW") + procCreatePseudoConsole = modkernel32.NewProc("CreatePseudoConsole") procCreateSymbolicLinkW = modkernel32.NewProc("CreateSymbolicLinkW") procCreateToolhelp32Snapshot = modkernel32.NewProc("CreateToolhelp32Snapshot") procDefineDosDeviceW = modkernel32.NewProc("DefineDosDeviceW") @@ -250,6 +254,7 @@ var ( procGetFileAttributesW = modkernel32.NewProc("GetFileAttributesW") procGetFileInformationByHandle = 
modkernel32.NewProc("GetFileInformationByHandle") procGetFileInformationByHandleEx = modkernel32.NewProc("GetFileInformationByHandleEx") + procGetFileTime = modkernel32.NewProc("GetFileTime") procGetFileType = modkernel32.NewProc("GetFileType") procGetFinalPathNameByHandleW = modkernel32.NewProc("GetFinalPathNameByHandleW") procGetFullPathNameW = modkernel32.NewProc("GetFullPathNameW") @@ -326,7 +331,9 @@ var ( procReadProcessMemory = modkernel32.NewProc("ReadProcessMemory") procReleaseMutex = modkernel32.NewProc("ReleaseMutex") procRemoveDirectoryW = modkernel32.NewProc("RemoveDirectoryW") + procRemoveDllDirectory = modkernel32.NewProc("RemoveDllDirectory") procResetEvent = modkernel32.NewProc("ResetEvent") + procResizePseudoConsole = modkernel32.NewProc("ResizePseudoConsole") procResumeThread = modkernel32.NewProc("ResumeThread") procSetCommTimeouts = modkernel32.NewProc("SetCommTimeouts") procSetConsoleCursorPosition = modkernel32.NewProc("SetConsoleCursorPosition") @@ -335,6 +342,7 @@ var ( procSetDefaultDllDirectories = modkernel32.NewProc("SetDefaultDllDirectories") procSetDllDirectoryW = modkernel32.NewProc("SetDllDirectoryW") procSetEndOfFile = modkernel32.NewProc("SetEndOfFile") + procSetFileValidData = modkernel32.NewProc("SetFileValidData") procSetEnvironmentVariableW = modkernel32.NewProc("SetEnvironmentVariableW") procSetErrorMode = modkernel32.NewProc("SetErrorMode") procSetEvent = modkernel32.NewProc("SetEvent") @@ -468,6 +476,8 @@ var ( procGetFileVersionInfoSizeW = modversion.NewProc("GetFileVersionInfoSizeW") procGetFileVersionInfoW = modversion.NewProc("GetFileVersionInfoW") procVerQueryValueW = modversion.NewProc("VerQueryValueW") + proctimeBeginPeriod = modwinmm.NewProc("timeBeginPeriod") + proctimeEndPeriod = modwinmm.NewProc("timeEndPeriod") procWinVerifyTrustEx = modwintrust.NewProc("WinVerifyTrustEx") procFreeAddrInfoW = modws2_32.NewProc("FreeAddrInfoW") procGetAddrInfoW = modws2_32.NewProc("GetAddrInfoW") @@ -1598,6 +1608,15 @@ func GetIfEntry(pIfRow *MibIfRow) (errcode error) { return } +func AddDllDirectory(path *uint16) (cookie uintptr, err error) { + r0, _, e1 := syscall.Syscall(procAddDllDirectory.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0) + cookie = uintptr(r0) + if cookie == 0 { + err = errnoErr(e1) + } + return +} + func AssignProcessToJobObject(job Handle, process Handle) (err error) { r1, _, e1 := syscall.Syscall(procAssignProcessToJobObject.Addr(), 2, uintptr(job), uintptr(process), 0) if r1 == 0 { @@ -1630,6 +1649,11 @@ func CloseHandle(handle Handle) (err error) { return } +func ClosePseudoConsole(console Handle) { + syscall.Syscall(procClosePseudoConsole.Addr(), 1, uintptr(console), 0, 0) + return +} + func ConnectNamedPipe(pipe Handle, overlapped *Overlapped) (err error) { r1, _, e1 := syscall.Syscall(procConnectNamedPipe.Addr(), 2, uintptr(pipe), uintptr(unsafe.Pointer(overlapped)), 0) if r1 == 0 { @@ -1759,6 +1783,14 @@ func CreateProcess(appName *uint16, commandLine *uint16, procSecurity *SecurityA return } +func createPseudoConsole(size uint32, in Handle, out Handle, flags uint32, pconsole *Handle) (hr error) { + r0, _, _ := syscall.Syscall6(procCreatePseudoConsole.Addr(), 5, uintptr(size), uintptr(in), uintptr(out), uintptr(flags), uintptr(unsafe.Pointer(pconsole)), 0) + if r0 != 0 { + hr = syscall.Errno(r0) + } + return +} + func CreateSymbolicLink(symlinkfilename *uint16, targetfilename *uint16, flags uint32) (err error) { r1, _, e1 := syscall.Syscall(procCreateSymbolicLinkW.Addr(), 3, uintptr(unsafe.Pointer(symlinkfilename)), 
uintptr(unsafe.Pointer(targetfilename)), uintptr(flags)) if r1&0xff == 0 { @@ -2166,6 +2198,14 @@ func GetFileInformationByHandleEx(handle Handle, class uint32, outBuffer *byte, return } +func GetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) { + r1, _, e1 := syscall.Syscall6(procGetFileTime.Addr(), 4, uintptr(handle), uintptr(unsafe.Pointer(ctime)), uintptr(unsafe.Pointer(atime)), uintptr(unsafe.Pointer(wtime)), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func GetFileType(filehandle Handle) (n uint32, err error) { r0, _, e1 := syscall.Syscall(procGetFileType.Addr(), 1, uintptr(filehandle), 0, 0) n = uint32(r0) @@ -2367,11 +2407,8 @@ func GetShortPathName(longpath *uint16, shortpath *uint16, buflen uint32) (n uin return } -func GetStartupInfo(startupInfo *StartupInfo) (err error) { - r1, _, e1 := syscall.Syscall(procGetStartupInfoW.Addr(), 1, uintptr(unsafe.Pointer(startupInfo)), 0, 0) - if r1 == 0 { - err = errnoErr(e1) - } +func getStartupInfo(startupInfo *StartupInfo) { + syscall.Syscall(procGetStartupInfoW.Addr(), 1, uintptr(unsafe.Pointer(startupInfo)), 0, 0) return } @@ -2854,6 +2891,14 @@ func RemoveDirectory(path *uint16) (err error) { return } +func RemoveDllDirectory(cookie uintptr) (err error) { + r1, _, e1 := syscall.Syscall(procRemoveDllDirectory.Addr(), 1, uintptr(cookie), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func ResetEvent(event Handle) (err error) { r1, _, e1 := syscall.Syscall(procResetEvent.Addr(), 1, uintptr(event), 0, 0) if r1 == 0 { @@ -2862,6 +2907,14 @@ func ResetEvent(event Handle) (err error) { return } +func resizePseudoConsole(pconsole Handle, size uint32) (hr error) { + r0, _, _ := syscall.Syscall(procResizePseudoConsole.Addr(), 2, uintptr(pconsole), uintptr(size), 0) + if r0 != 0 { + hr = syscall.Errno(r0) + } + return +} + func ResumeThread(thread Handle) (ret uint32, err error) { r0, _, e1 := syscall.Syscall(procResumeThread.Addr(), 1, uintptr(thread), 0, 0) ret = uint32(r0) @@ -2936,6 +2989,14 @@ func SetEndOfFile(handle Handle) (err error) { return } +func SetFileValidData(handle Handle, validDataLength int64) (err error) { + r1, _, e1 := syscall.Syscall(procSetFileValidData.Addr(), 2, uintptr(handle), uintptr(validDataLength), 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func SetEnvironmentVariable(name *uint16, value *uint16) (err error) { r1, _, e1 := syscall.Syscall(procSetEnvironmentVariableW.Addr(), 2, uintptr(unsafe.Pointer(name)), uintptr(unsafe.Pointer(value)), 0) if r1 == 0 { @@ -3820,9 +3881,9 @@ func setupUninstallOEMInf(infFileName *uint16, flags SUOI, reserved uintptr) (er return } -func CommandLineToArgv(cmd *uint16, argc *int32) (argv *[8192]*[8192]uint16, err error) { +func commandLineToArgv(cmd *uint16, argc *int32) (argv **uint16, err error) { r0, _, e1 := syscall.Syscall(procCommandLineToArgvW.Addr(), 2, uintptr(unsafe.Pointer(cmd)), uintptr(unsafe.Pointer(argc)), 0) - argv = (*[8192]*[8192]uint16)(unsafe.Pointer(r0)) + argv = (**uint16)(unsafe.Pointer(r0)) if argv == nil { err = errnoErr(e1) } @@ -4017,6 +4078,22 @@ func _VerQueryValue(block unsafe.Pointer, subBlock *uint16, pointerToBufferPoint return } +func TimeBeginPeriod(period uint32) (err error) { + r1, _, e1 := syscall.Syscall(proctimeBeginPeriod.Addr(), 1, uintptr(period), 0, 0) + if r1 != 0 { + err = errnoErr(e1) + } + return +} + +func TimeEndPeriod(period uint32) (err error) { + r1, _, e1 := syscall.Syscall(proctimeEndPeriod.Addr(), 1, uintptr(period), 0, 0) + if r1 != 0 { + err = 
errnoErr(e1) + } + return +} + func WinVerifyTrustEx(hwnd HWND, actionId *GUID, data *WinTrustData) (ret error) { r0, _, _ := syscall.Syscall(procWinVerifyTrustEx.Addr(), 3, uintptr(hwnd), uintptr(unsafe.Pointer(actionId)), uintptr(unsafe.Pointer(data))) if r0 != 0 { diff --git a/vendor/modules.txt b/vendor/modules.txt index d25103fce..a1959a9a0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -461,9 +461,12 @@ github.com/klauspost/compress/internal/cpuinfo github.com/klauspost/compress/internal/snapref github.com/klauspost/compress/zstd github.com/klauspost/compress/zstd/internal/xxhash -# github.com/klauspost/cpuid/v2 v2.2.5 +# github.com/klauspost/cpuid/v2 v2.2.7 ## explicit; go 1.15 github.com/klauspost/cpuid/v2 +# github.com/klauspost/reedsolomon v1.12.1 +## explicit; go 1.18 +github.com/klauspost/reedsolomon # github.com/koron/go-ssdp v0.0.4 ## explicit; go 1.19 github.com/koron/go-ssdp @@ -1238,11 +1241,10 @@ golang.org/x/net/route ## explicit; go 1.17 golang.org/x/sync/errgroup golang.org/x/sync/singleflight -# golang.org/x/sys v0.11.0 -## explicit; go 1.17 +# golang.org/x/sys v0.18.0 +## explicit; go 1.18 golang.org/x/sys/cpu golang.org/x/sys/execabs -golang.org/x/sys/internal/unsafeheader golang.org/x/sys/plan9 golang.org/x/sys/unix golang.org/x/sys/windows
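Taken together, the Windows changes above expose the ConPTY lifecycle: CreatePseudoConsole and ResizePseudoConsole (thin wrappers that repack Coord into a uint32), ClosePseudoConsole, the PSEUDOCONSOLE_INHERIT_CURSOR flag, and PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE for attaching the console to a child process. A hedged lifecycle sketch follows, with the child-process wiring omitted and the 80x25 / 120x40 sizes purely illustrative.

//go:build windows

package main

import (
	"log"

	"golang.org/x/sys/windows"
)

func main() {
	// Pipes carrying input to and output from the pseudo console.
	var ptyIn, inputW, outputR, ptyOut windows.Handle
	if err := windows.CreatePipe(&ptyIn, &inputW, nil, 0); err != nil {
		log.Fatal(err)
	}
	if err := windows.CreatePipe(&outputR, &ptyOut, nil, 0); err != nil {
		log.Fatal(err)
	}

	var pc windows.Handle
	if err := windows.CreatePseudoConsole(windows.Coord{X: 80, Y: 25}, ptyIn, ptyOut, 0, &pc); err != nil {
		log.Fatalf("CreatePseudoConsole: %v", err)
	}
	defer windows.ClosePseudoConsole(pc)

	// On a terminal resize, grow the console's internal buffers to match.
	if err := windows.ResizePseudoConsole(pc, windows.Coord{X: 120, Y: 40}); err != nil {
		log.Printf("ResizePseudoConsole: %v", err)
	}

	// inputW / outputR would be handed to the code driving the terminal;
	// all four pipe handles should eventually be closed with CloseHandle.
}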