status-go/vendor/lukechampine.com/blake3/guts/compress_amd64.s

// Code generated by command: go run gen.go -out compress_amd64.s. DO NOT EDIT.
#include "textflag.h"
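// Shared constants:
//   - iv holds the first four BLAKE3 IV words (identical to the SHA-256 initial state).
//   - seq (dwords 0..15) and seq64 (qwords 0..7) are lane-index vectors used to build
//     per-lane counters and gather/scatter offsets.
//   - shuffle_rot8 and shuffle_rot16 are VPSHUFB masks that rotate each 32-bit lane
//     right by 8 and 16 bits; the AVX2 routine uses them in place of VPRORD.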
DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16
DATA seq<>+0(SB)/4, $0x00000000
DATA seq<>+4(SB)/4, $0x00000001
DATA seq<>+8(SB)/4, $0x00000002
DATA seq<>+12(SB)/4, $0x00000003
DATA seq<>+16(SB)/4, $0x00000004
DATA seq<>+20(SB)/4, $0x00000005
DATA seq<>+24(SB)/4, $0x00000006
DATA seq<>+28(SB)/4, $0x00000007
DATA seq<>+32(SB)/4, $0x00000008
DATA seq<>+36(SB)/4, $0x00000009
DATA seq<>+40(SB)/4, $0x0000000a
DATA seq<>+44(SB)/4, $0x0000000b
DATA seq<>+48(SB)/4, $0x0000000c
DATA seq<>+52(SB)/4, $0x0000000d
DATA seq<>+56(SB)/4, $0x0000000e
DATA seq<>+60(SB)/4, $0x0000000f
GLOBL seq<>(SB), RODATA|NOPTR, $64
DATA seq64<>+0(SB)/8, $0x0000000000000000
DATA seq64<>+8(SB)/8, $0x0000000000000001
DATA seq64<>+16(SB)/8, $0x0000000000000002
DATA seq64<>+24(SB)/8, $0x0000000000000003
DATA seq64<>+32(SB)/8, $0x0000000000000004
DATA seq64<>+40(SB)/8, $0x0000000000000005
DATA seq64<>+48(SB)/8, $0x0000000000000006
DATA seq64<>+56(SB)/8, $0x0000000000000007
GLOBL seq64<>(SB), RODATA|NOPTR, $64
DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32
DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32
// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX512BW, AVX512F
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
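// Performs 16 compressions in parallel, one per 32-bit lane of the ZMM registers.
// The layout is transposed: each even-numbered Z register holds one state word
// across all 16 lanes, and each odd-numbered Z register holds one broadcast message
// word. The block and CV are identical in every lane while the counter runs from
// counter+0 to counter+15, so the routine emits 16 consecutive 64-byte output
// blocks for a single chaining value (XOF output expansion).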
MOVQ out+0(FP), AX
MOVQ block+8(FP), CX
MOVQ cv+16(FP), DX
// Initialize block vectors
VPBROADCASTD (CX), Z1
VPBROADCASTD 4(CX), Z3
VPBROADCASTD 8(CX), Z5
VPBROADCASTD 12(CX), Z7
VPBROADCASTD 16(CX), Z9
VPBROADCASTD 20(CX), Z11
VPBROADCASTD 24(CX), Z13
VPBROADCASTD 28(CX), Z15
VPBROADCASTD 32(CX), Z17
VPBROADCASTD 36(CX), Z19
VPBROADCASTD 40(CX), Z21
VPBROADCASTD 44(CX), Z23
VPBROADCASTD 48(CX), Z25
VPBROADCASTD 52(CX), Z27
VPBROADCASTD 56(CX), Z29
VPBROADCASTD 60(CX), Z31
// Initialize state vectors
VPBROADCASTD (DX), Z0
VPBROADCASTD 4(DX), Z2
VPBROADCASTD 8(DX), Z4
VPBROADCASTD 12(DX), Z6
VPBROADCASTD 16(DX), Z8
VPBROADCASTD 20(DX), Z10
VPBROADCASTD 24(DX), Z12
VPBROADCASTD 28(DX), Z14
VPBROADCASTD iv<>+0(SB), Z16
VPBROADCASTD iv<>+4(SB), Z18
VPBROADCASTD iv<>+8(SB), Z20
VPBROADCASTD iv<>+12(SB), Z22
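// The 64-bit counter is kept as separate lo/hi dword vectors. After adding 0..15 to
// the low words, the VPCMPUD/K1 masked add carries a 1 into the high word of every
// lane whose low word wrapped.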
VPBROADCASTD counter+24(FP), Z24
VPADDD seq<>+0(SB), Z24, Z24
VPCMPUD $0x01, seq<>+0(SB), Z24, K1
VPBROADCASTD counter+28(FP), Z26
VPADDD.BCST seq<>+4(SB), Z26, K1, Z26
VPBROADCASTD blockLen+32(FP), Z28
VPBROADCASTD flags+36(FP), Z30
// Round 1
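// Each round applies the G mixing function to the four columns and then the four
// diagonals of the 4x4 state. One G application is the 14-instruction pattern below:
//   a += b; a += m0; d ^= a; d >>>= 16; c += d; b ^= c; b >>>= 12;
//   a += b; a += m1; d ^= a; d >>>= 8;  c += d; b ^= c; b >>>= 7
// Rounds 2-7 are the same schedule with the BLAKE3 message permutation applied,
// which is why only the message (odd-numbered) registers differ between rounds.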
VPADDD Z0, Z8, Z0
VPADDD Z1, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z3, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z5, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z7, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z9, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z11, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z13, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z15, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z17, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z19, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z21, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z23, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z25, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z27, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z29, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z31, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 2
VPADDD Z0, Z8, Z0
VPADDD Z5, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z13, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z7, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z21, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z15, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z1, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z9, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z27, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z3, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z23, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z25, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z11, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z19, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z29, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z31, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z17, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 3
VPADDD Z0, Z8, Z0
VPADDD Z7, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z9, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z21, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z25, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z27, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z5, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z15, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z29, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z13, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z11, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z19, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z1, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z23, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z31, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z17, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z3, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 4
VPADDD Z0, Z8, Z0
VPADDD Z21, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z15, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z25, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z19, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z29, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z7, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z27, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z31, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z9, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z1, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z23, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z5, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z11, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z17, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z3, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z13, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 5
VPADDD Z0, Z8, Z0
VPADDD Z25, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z27, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z19, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z23, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z31, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z21, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z29, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z17, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z15, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z5, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z11, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z7, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z1, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z3, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z13, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z9, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 6
VPADDD Z0, Z8, Z0
VPADDD Z19, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z29, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z23, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z11, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z17, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z25, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z31, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z3, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z27, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z7, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z1, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z21, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z5, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z13, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z9, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z15, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 7
VPADDD Z0, Z8, Z0
VPADDD Z23, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z31, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z11, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z1, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z3, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z19, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z17, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z13, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z29, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z21, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z5, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z25, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z7, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z9, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z15, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z27, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Finalize CVs
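// Full 64-byte compression output: the first eight state words are XORed with the
// last eight, and the last eight are XORed with the input CV. The scatters below
// de-transpose the result, writing each lane's 64-byte block at out[lane*64]
// (Z1 holds the per-lane byte offsets, seq<<6).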
VPXORD Z0, Z16, Z0
VPXORD Z2, Z18, Z2
VPXORD Z4, Z20, Z4
VPXORD Z6, Z22, Z6
VPXORD Z8, Z24, Z8
VPXORD Z10, Z26, Z10
VPXORD Z12, Z28, Z12
VPXORD Z14, Z30, Z14
VPXORD.BCST (DX), Z16, Z16
VPXORD.BCST 4(DX), Z18, Z18
VPXORD.BCST 8(DX), Z20, Z20
VPXORD.BCST 12(DX), Z22, Z22
VPXORD.BCST 16(DX), Z24, Z24
VPXORD.BCST 20(DX), Z26, Z26
VPXORD.BCST 24(DX), Z28, Z28
VPXORD.BCST 28(DX), Z30, Z30
VMOVDQU32 seq<>+0(SB), Z1
VPSLLD $0x06, Z1, Z1
KXNORD K1, K1, K1
VPSCATTERDD Z0, K1, (AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
RET
// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512BW, AVX512F
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
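// Hashes 16 independent 1024-byte chunks in parallel, producing one 8-word CV per
// chunk. Each chunk is 16 blocks of 64 bytes; every loop iteration compresses block
// DX of all 16 chunks at once, carrying the chaining values in Z0..Z14 between
// iterations. The 192-byte frame holds the per-lane counter (lo and hi dword
// vectors) and a 16-entry flags array indexed by block number.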
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX
// Initialize counter
VPBROADCASTD counter+24(FP), Z0
VPADDD seq<>+0(SB), Z0, Z0
VPCMPUD $0x01, seq<>+0(SB), Z0, K1
VPBROADCASTD counter+28(FP), Z2
VPADDD.BCST seq<>+4(SB), Z2, K1, Z2
VMOVDQU32 Z0, (SP)
VMOVDQU32 Z2, 64(SP)
// Initialize flags
VPBROADCASTD flags+32(FP), Z0
VMOVDQU32 Z0, 128(SP)
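// The first block of each chunk gets the CHUNK_START flag (0x01) and the last block
// gets CHUNK_END (0x02); the other 14 blocks use the caller-supplied flags as-is.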
ORL $0x01, 128(SP)
ORL $0x02, 188(SP)
// Load key
VPBROADCASTD (DX), Z0
VPBROADCASTD 4(DX), Z2
VPBROADCASTD 8(DX), Z4
VPBROADCASTD 12(DX), Z6
VPBROADCASTD 16(DX), Z8
VPBROADCASTD 20(DX), Z10
VPBROADCASTD 24(DX), Z12
VPBROADCASTD 28(DX), Z14
// Loop index
XORQ DX, DX
loop:
// Load transposed block
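// Z16 holds per-lane byte offsets of lane*1024 (seq<<10), so each gather reads the
// same dword position from all 16 chunks, yielding a transposed message block.
// CX advances by 64 per iteration to step through the blocks of every chunk.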
VMOVDQU32 seq<>+0(SB), Z16
VPSLLD $0x0a, Z16, Z16
KXNORD K1, K1, K1
VPGATHERDD (CX)(Z16*1), K1, Z1
KXNORD K1, K1, K1
VPGATHERDD 4(CX)(Z16*1), K1, Z3
KXNORD K1, K1, K1
VPGATHERDD 8(CX)(Z16*1), K1, Z5
KXNORD K1, K1, K1
VPGATHERDD 12(CX)(Z16*1), K1, Z7
KXNORD K1, K1, K1
VPGATHERDD 16(CX)(Z16*1), K1, Z9
KXNORD K1, K1, K1
VPGATHERDD 20(CX)(Z16*1), K1, Z11
KXNORD K1, K1, K1
VPGATHERDD 24(CX)(Z16*1), K1, Z13
KXNORD K1, K1, K1
VPGATHERDD 28(CX)(Z16*1), K1, Z15
KXNORD K1, K1, K1
VPGATHERDD 32(CX)(Z16*1), K1, Z17
KXNORD K1, K1, K1
VPGATHERDD 36(CX)(Z16*1), K1, Z19
KXNORD K1, K1, K1
VPGATHERDD 40(CX)(Z16*1), K1, Z21
KXNORD K1, K1, K1
VPGATHERDD 44(CX)(Z16*1), K1, Z23
KXNORD K1, K1, K1
VPGATHERDD 48(CX)(Z16*1), K1, Z25
KXNORD K1, K1, K1
VPGATHERDD 52(CX)(Z16*1), K1, Z27
KXNORD K1, K1, K1
VPGATHERDD 56(CX)(Z16*1), K1, Z29
KXNORD K1, K1, K1
VPGATHERDD 60(CX)(Z16*1), K1, Z31
ADDQ $0x40, CX
// Reload state vectors (other than CVs)
VPBROADCASTD iv<>+0(SB), Z16
VPBROADCASTD iv<>+4(SB), Z18
VPBROADCASTD iv<>+8(SB), Z20
VPBROADCASTD iv<>+12(SB), Z22
VMOVDQU32 (SP), Z24
VMOVDQU32 64(SP), Z26
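// Z28 below is the block length (the constant 64, built as 1<<6); Z30 broadcasts
// the flags entry for the current block index DX.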
VPBROADCASTD seq<>+4(SB), Z28
VPSLLD $0x06, Z28, Z28
VPBROADCASTD 128(SP)(DX*4), Z30
// Round 1
VPADDD Z0, Z8, Z0
VPADDD Z1, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z3, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z5, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z7, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z9, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z11, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z13, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z15, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z17, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z19, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z21, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z23, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z25, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z27, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z29, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z31, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 2
VPADDD Z0, Z8, Z0
VPADDD Z5, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z13, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z7, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z21, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z15, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z1, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z9, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z27, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z3, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z23, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z25, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z11, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z19, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z29, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z31, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z17, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 3
VPADDD Z0, Z8, Z0
VPADDD Z7, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z9, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z21, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z25, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z27, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z5, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z15, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z29, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z13, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z11, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z19, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z1, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z23, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z31, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z17, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z3, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 4
VPADDD Z0, Z8, Z0
VPADDD Z21, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z15, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z25, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z19, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z29, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z7, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z27, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z31, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z9, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z1, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z23, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z5, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z11, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z17, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z3, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z13, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 5
VPADDD Z0, Z8, Z0
VPADDD Z25, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z27, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z19, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z23, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z31, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z21, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z29, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z17, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z15, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z5, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z11, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z7, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z1, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z3, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z13, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z9, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 6
VPADDD Z0, Z8, Z0
VPADDD Z19, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z29, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z23, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z11, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z17, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z25, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z31, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z3, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z27, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z7, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z1, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z21, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z5, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z13, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z9, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z15, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Round 7
VPADDD Z0, Z8, Z0
VPADDD Z23, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z0, Z8, Z0
VPADDD Z31, Z0, Z0
VPXORD Z24, Z0, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z16, Z24, Z16
VPXORD Z8, Z16, Z8
VPRORD $0x07, Z8, Z8
VPADDD Z2, Z10, Z2
VPADDD Z11, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z2, Z10, Z2
VPADDD Z1, Z2, Z2
VPXORD Z26, Z2, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z18, Z26, Z18
VPXORD Z10, Z18, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z4, Z12, Z4
VPADDD Z3, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z4, Z12, Z4
VPADDD Z19, Z4, Z4
VPXORD Z28, Z4, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z20, Z28, Z20
VPXORD Z12, Z20, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z6, Z14, Z6
VPADDD Z17, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z6, Z14, Z6
VPADDD Z13, Z6, Z6
VPXORD Z30, Z6, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z22, Z30, Z22
VPXORD Z14, Z22, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z0, Z10, Z0
VPADDD Z29, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x10, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x0c, Z10, Z10
VPADDD Z0, Z10, Z0
VPADDD Z21, Z0, Z0
VPXORD Z30, Z0, Z30
VPRORD $0x08, Z30, Z30
VPADDD Z20, Z30, Z20
VPXORD Z10, Z20, Z10
VPRORD $0x07, Z10, Z10
VPADDD Z2, Z12, Z2
VPADDD Z5, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x10, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x0c, Z12, Z12
VPADDD Z2, Z12, Z2
VPADDD Z25, Z2, Z2
VPXORD Z24, Z2, Z24
VPRORD $0x08, Z24, Z24
VPADDD Z22, Z24, Z22
VPXORD Z12, Z22, Z12
VPRORD $0x07, Z12, Z12
VPADDD Z4, Z14, Z4
VPADDD Z7, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x10, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x0c, Z14, Z14
VPADDD Z4, Z14, Z4
VPADDD Z9, Z4, Z4
VPXORD Z26, Z4, Z26
VPRORD $0x08, Z26, Z26
VPADDD Z16, Z26, Z16
VPXORD Z14, Z16, Z14
VPRORD $0x07, Z14, Z14
VPADDD Z6, Z8, Z6
VPADDD Z15, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x10, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x0c, Z8, Z8
VPADDD Z6, Z8, Z6
VPADDD Z27, Z6, Z6
VPXORD Z28, Z6, Z28
VPRORD $0x08, Z28, Z28
VPADDD Z18, Z28, Z18
VPXORD Z8, Z18, Z8
VPRORD $0x07, Z8, Z8
// Finalize CVs
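// Only the chaining value is needed here: the first eight state words XORed with
// the last eight. It stays in Z0..Z14 and feeds the next block of each chunk.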
VPXORD Z0, Z16, Z0
VPXORD Z2, Z18, Z2
VPXORD Z4, Z20, Z4
VPXORD Z6, Z22, Z6
VPXORD Z8, Z24, Z8
VPXORD Z10, Z26, Z10
VPXORD Z12, Z28, Z12
VPXORD Z14, Z30, Z14
// Loop
INCQ DX
CMPQ DX, $0x00000010
JNE loop
// Finished; transpose CVs
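// Scatter the eight CV words back into cvs[16][8]uint32; Z16 holds the per-lane
// byte offsets of lane*32 (seq<<5), one 32-byte CV per chunk.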
VMOVDQU32 seq<>+0(SB), Z16
VPSLLD $0x05, Z16, Z16
KXNORD K1, K1, K1
VPSCATTERDD Z0, K1, (AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
RET
// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
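// 8-wide AVX2 variant of compressBlocksAVX512, using the same transposed layout.
// With only 16 YMM registers, the broadcast message words are spilled to the stack
// (sixteen 32-byte vectors at 0..480(SP)) and state word v8 (the first IV word)
// lives at 512(SP), with Y8 doubling as scratch. Rotations by 16 and 8 use the
// VPSHUFB masks above; rotations by 12 and 7 are shift/shift/or sequences.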
MOVQ out+0(FP), AX
MOVQ block+8(FP), CX
MOVQ cv+16(FP), DX
// Load block
VPBROADCASTD (CX), Y0
VMOVDQU Y0, (SP)
VPBROADCASTD 4(CX), Y0
VMOVDQU Y0, 32(SP)
VPBROADCASTD 8(CX), Y0
VMOVDQU Y0, 64(SP)
VPBROADCASTD 12(CX), Y0
VMOVDQU Y0, 96(SP)
VPBROADCASTD 16(CX), Y0
VMOVDQU Y0, 128(SP)
VPBROADCASTD 20(CX), Y0
VMOVDQU Y0, 160(SP)
VPBROADCASTD 24(CX), Y0
VMOVDQU Y0, 192(SP)
VPBROADCASTD 28(CX), Y0
VMOVDQU Y0, 224(SP)
VPBROADCASTD 32(CX), Y0
VMOVDQU Y0, 256(SP)
VPBROADCASTD 36(CX), Y0
VMOVDQU Y0, 288(SP)
VPBROADCASTD 40(CX), Y0
VMOVDQU Y0, 320(SP)
VPBROADCASTD 44(CX), Y0
VMOVDQU Y0, 352(SP)
VPBROADCASTD 48(CX), Y0
VMOVDQU Y0, 384(SP)
VPBROADCASTD 52(CX), Y0
VMOVDQU Y0, 416(SP)
VPBROADCASTD 56(CX), Y0
VMOVDQU Y0, 448(SP)
VPBROADCASTD 60(CX), Y0
VMOVDQU Y0, 480(SP)
// Initialize state vectors
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VPBROADCASTQ counter+24(FP), Y12
VPBROADCASTQ counter+24(FP), Y13
VPADDQ seq64<>+0(SB), Y12, Y12
VPADDQ seq64<>+32(SB), Y13, Y13
VPUNPCKLDQ Y13, Y12, Y14
VPUNPCKHDQ Y13, Y12, Y15
VPUNPCKLDQ Y15, Y14, Y12
VPUNPCKHDQ Y15, Y14, Y13
VPERMQ $0xd8, Y12, Y12
VPERMQ $0xd8, Y13, Y13
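// Y12 and Y13 now hold the low and high dwords of the per-lane counters
// counter+0..7, split out of the qword sums by the unpack/permute sequence above.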
VPBROADCASTD blockLen+32(FP), Y14
VPBROADCASTD flags+36(FP), Y15
VMOVDQU Y8, 512(SP)
// Round 1
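// Same column/diagonal G schedule as the AVX512 routine; the extra VMOVDQU pairs
// around 512(SP) swap the spilled state word in and out of Y8 whenever a G step
// needs it.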
VPADDD Y0, Y4, Y0
VPADDD (SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 256(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 2
VPADDD Y0, Y4, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 224(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD (SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 3
VPADDD Y0, Y4, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 160(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 352(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD (SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
VPADDD 384(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 320(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD (SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 192(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 7
VPADDD Y0, Y4, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 480(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD (SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VMOVDQU 512(SP), Y8
// Finalize CVs
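	// Each of the 8 output blocks is 64 bytes: the low 32 bytes are
	// state rows 0-7 XOR rows 8-15, and the high 32 bytes are rows 8-15
	// XOR the input chaining value (the BLAKE3 extended-output
	// construction). Y8, the spilled state row, was just reloaded from
	// 512(SP); rows 8-15 are stashed at 256..480(SP) so they can be
	// reused for the high halves below.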
VMOVDQU Y8, 256(SP)
VMOVDQU Y9, 288(SP)
VMOVDQU Y10, 320(SP)
VMOVDQU Y11, 352(SP)
VMOVDQU Y12, 384(SP)
VMOVDQU Y13, 416(SP)
VMOVDQU Y14, 448(SP)
VMOVDQU Y15, 480(SP)
VPXOR Y0, Y8, Y0
VPXOR Y1, Y9, Y1
VPXOR Y2, Y10, Y2
VPXOR Y3, Y11, Y3
VPXOR Y4, Y12, Y4
VPXOR Y5, Y13, Y5
VPXOR Y6, Y14, Y6
VPXOR Y7, Y15, Y7
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKHDQ Y1, Y0, Y9
VPUNPCKLDQ Y3, Y2, Y10
VPUNPCKHDQ Y3, Y2, Y11
VPUNPCKLDQ Y5, Y4, Y12
VPUNPCKHDQ Y5, Y4, Y13
VPUNPCKLDQ Y7, Y6, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y10, Y8, Y0
VPUNPCKHQDQ Y10, Y8, Y1
VPUNPCKLQDQ Y11, Y9, Y2
VPUNPCKHQDQ Y11, Y9, Y3
VPUNPCKLQDQ Y14, Y12, Y4
VPUNPCKHQDQ Y14, Y12, Y5
VPUNPCKLQDQ Y15, Y13, Y6
VPUNPCKHQDQ Y15, Y13, Y7
VPERM2I128 $0x20, Y4, Y0, Y8
VPERM2I128 $0x31, Y4, Y0, Y12
VPERM2I128 $0x20, Y5, Y1, Y9
VPERM2I128 $0x31, Y5, Y1, Y13
VPERM2I128 $0x20, Y6, Y2, Y10
VPERM2I128 $0x31, Y6, Y2, Y14
VPERM2I128 $0x20, Y7, Y3, Y11
VPERM2I128 $0x31, Y7, Y3, Y15
VMOVDQU Y8, (AX)
VMOVDQU Y9, 64(AX)
VMOVDQU Y10, 128(AX)
VMOVDQU Y11, 192(AX)
VMOVDQU Y12, 256(AX)
VMOVDQU Y13, 320(AX)
VMOVDQU Y14, 384(AX)
VMOVDQU Y15, 448(AX)
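	// Second half of each output block: the stashed rows 8-15 are XORed
	// with the broadcast input CV words and stored at 32(AX)+64*k,
	// interleaved with the low halves written above.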
VMOVDQU 256(SP), Y8
VMOVDQU 288(SP), Y9
VMOVDQU 320(SP), Y10
VMOVDQU 352(SP), Y11
VMOVDQU 384(SP), Y12
VMOVDQU 416(SP), Y13
VMOVDQU 448(SP), Y14
VMOVDQU 480(SP), Y15
VPBROADCASTD (DX), Y0
VPXOR Y0, Y8, Y8
VPBROADCASTD 4(DX), Y0
VPXOR Y0, Y9, Y9
VPBROADCASTD 8(DX), Y0
VPXOR Y0, Y10, Y10
VPBROADCASTD 12(DX), Y0
VPXOR Y0, Y11, Y11
VPBROADCASTD 16(DX), Y0
VPXOR Y0, Y12, Y12
VPBROADCASTD 20(DX), Y0
VPXOR Y0, Y13, Y13
VPBROADCASTD 24(DX), Y0
VPXOR Y0, Y14, Y14
VPBROADCASTD 28(DX), Y0
VPXOR Y0, Y15, Y15
VPUNPCKLDQ Y9, Y8, Y0
VPUNPCKHDQ Y9, Y8, Y1
VPUNPCKLDQ Y11, Y10, Y2
VPUNPCKHDQ Y11, Y10, Y3
VPUNPCKLDQ Y13, Y12, Y4
VPUNPCKHDQ Y13, Y12, Y5
VPUNPCKLDQ Y15, Y14, Y6
VPUNPCKHDQ Y15, Y14, Y7
VPUNPCKLQDQ Y2, Y0, Y8
VPUNPCKHQDQ Y2, Y0, Y9
VPUNPCKLQDQ Y3, Y1, Y10
VPUNPCKHQDQ Y3, Y1, Y11
VPUNPCKLQDQ Y6, Y4, Y12
VPUNPCKHQDQ Y6, Y4, Y13
VPUNPCKLQDQ Y7, Y5, Y14
VPUNPCKHQDQ Y7, Y5, Y15
VPERM2I128 $0x20, Y12, Y8, Y0
VPERM2I128 $0x31, Y12, Y8, Y4
VPERM2I128 $0x20, Y13, Y9, Y1
VPERM2I128 $0x31, Y13, Y9, Y5
VPERM2I128 $0x20, Y14, Y10, Y2
VPERM2I128 $0x31, Y14, Y10, Y6
VPERM2I128 $0x20, Y15, Y11, Y3
VPERM2I128 $0x31, Y15, Y11, Y7
VMOVDQU Y0, 32(AX)
VMOVDQU Y1, 96(AX)
VMOVDQU Y2, 160(AX)
VMOVDQU Y3, 224(AX)
VMOVDQU Y4, 288(AX)
VMOVDQU Y5, 352(AX)
VMOVDQU Y6, 416(AX)
VMOVDQU Y7, 480(AX)
VZEROUPPER
RET
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
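// compressChunksAVX2 hashes 8 whole chunks (8 x 1024 bytes of buf) in
// parallel, producing one 8-word chaining value per chunk in cvs. Each YMM
// register holds the same state word for all 8 chunks, so the 16 blocks of
// every chunk are compressed in lockstep by the loop below.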
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX
// Load key
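	// The key words become the initial chaining-value rows (Y0-Y7) for
	// every chunk.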
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7
// Initialize counter
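	// Chunk i uses counter+i. The eight 64-bit counters are built from
	// seq64, then the unpack/permute steps below split them into a vector
	// of low words (saved at 512(SP)) and a vector of high words (544(SP)).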
VPBROADCASTQ counter+24(FP), Y12
VPBROADCASTQ counter+24(FP), Y13
VPADDQ seq64<>+0(SB), Y12, Y12
VPADDQ seq64<>+32(SB), Y13, Y13
VPUNPCKLDQ Y13, Y12, Y14
VPUNPCKHDQ Y13, Y12, Y15
VPUNPCKLDQ Y15, Y14, Y12
VPUNPCKHDQ Y15, Y14, Y13
VPERMQ $0xd8, Y12, Y12
VPERMQ $0xd8, Y13, Y13
VMOVDQU Y12, 512(SP)
VMOVDQU Y13, 544(SP)
// Initialize flags
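	// 576(SP) holds one 32-bit flag word per block (16 words). The first
	// block of each chunk additionally gets the CHUNK_START flag (0x01)
	// and the last block gets CHUNK_END (0x02); 636(SP) is the 16th word.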
VPBROADCASTD flags+32(FP), Y14
VMOVDQU Y14, 576(SP)
VMOVDQU Y14, 608(SP)
ORL $0x01, 576(SP)
ORL $0x02, 636(SP)
	// Loop index: DX counts the 16 blocks of each chunk
XORQ DX, DX
loop:
// Load transposed block
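	// Y9 = {0..7} * 1024, so lane i of each gather reads from chunk i's
	// current block. The 16 gathers transpose the block into 16 vectors of
	// message words, stored at (SP)..480(SP).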
VMOVDQU seq<>+0(SB), Y9
VPSLLD $0x0a, Y9, Y9
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, (CX)(Y9*1), Y10
VMOVDQU Y10, (SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
VMOVDQU Y10, 32(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
VMOVDQU Y10, 64(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
VMOVDQU Y10, 96(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
VMOVDQU Y10, 128(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
VMOVDQU Y10, 160(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
VMOVDQU Y10, 192(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
VMOVDQU Y10, 224(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
VMOVDQU Y10, 256(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
VMOVDQU Y10, 288(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
VMOVDQU Y10, 320(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
VMOVDQU Y10, 352(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
VMOVDQU Y10, 384(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
VMOVDQU Y10, 416(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
VMOVDQU Y10, 448(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
VMOVDQU Y10, 480(SP)
ADDQ $0x40, CX
// Reload state vectors (other than CVs)
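	// Rows 8-11 are the IV constants, rows 12-13 the counter low/high
	// words, row 14 the block length (1<<6 = 64), and row 15 this block's
	// flags. Row 8 is spilled to 640(SP) because Y8 is needed as a scratch
	// register during the rounds.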
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VMOVDQU 512(SP), Y12
VMOVDQU 544(SP), Y13
VPBROADCASTD seq<>+4(SB), Y14
VPSLLD $0x06, Y14, Y14
VPBROADCASTD 576(SP)(DX*4), Y15
VMOVDQU Y8, 640(SP)
// Round 1
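	// Each round applies the G function to the four columns and then the
	// four diagonals of the 4x4 state, 8 lanes at a time. Rotations by 16
	// and 8 use VPSHUFB with the shuffle_rot tables; rotations by 12 and 7
	// use a shift/shift/OR sequence. G functions that touch state row 8
	// reload it from and re-spill it to 640(SP) around the update.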
VPADDD Y0, Y4, Y0
VPADDD (SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 256(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 2
VPADDD Y0, Y4, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 224(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD (SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 3
VPADDD Y0, Y4, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 160(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 352(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD (SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
VPADDD 384(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 320(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD (SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 192(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 7
VPADDD Y0, Y4, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 480(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 640(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD (SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 640(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 640(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VMOVDQU 640(SP), Y8
// Finalize CVs
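	// New chaining value for each chunk: rows 0-7 XOR rows 8-15. The
	// result stays in Y0-Y7 and feeds directly into the next block's
	// compression.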
VPXOR Y0, Y8, Y0
VPXOR Y1, Y9, Y1
VPXOR Y2, Y10, Y2
VPXOR Y3, Y11, Y3
VPXOR Y4, Y12, Y4
VPXOR Y5, Y13, Y5
VPXOR Y6, Y14, Y6
VPXOR Y7, Y15, Y7
	// Loop until all 16 blocks of each chunk have been compressed
INCQ DX
CMPQ DX, $0x00000010
JNE loop
// Finished; transpose CVs
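	// The 8 CV rows are word-major (one word of every chunk per register);
	// transpose them to chunk-major order so each chunk's 32-byte CV is
	// contiguous before storing to cvs.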
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKHDQ Y1, Y0, Y9
VPUNPCKLDQ Y3, Y2, Y10
VPUNPCKHDQ Y3, Y2, Y11
VPUNPCKLDQ Y5, Y4, Y12
VPUNPCKHDQ Y5, Y4, Y13
VPUNPCKLDQ Y7, Y6, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y10, Y8, Y0
VPUNPCKHQDQ Y10, Y8, Y1
VPUNPCKLQDQ Y11, Y9, Y2
VPUNPCKHQDQ Y11, Y9, Y3
VPUNPCKLQDQ Y14, Y12, Y4
VPUNPCKHQDQ Y14, Y12, Y5
VPUNPCKLQDQ Y15, Y13, Y6
VPUNPCKHQDQ Y15, Y13, Y7
VPERM2I128 $0x20, Y4, Y0, Y8
VPERM2I128 $0x31, Y4, Y0, Y12
VPERM2I128 $0x20, Y5, Y1, Y9
VPERM2I128 $0x31, Y5, Y1, Y13
VPERM2I128 $0x20, Y6, Y2, Y10
VPERM2I128 $0x31, Y6, Y2, Y14
VPERM2I128 $0x20, Y7, Y3, Y11
VPERM2I128 $0x31, Y7, Y3, Y15
VMOVDQU Y8, (AX)
VMOVDQU Y9, 32(AX)
VMOVDQU Y10, 64(AX)
VMOVDQU Y11, 96(AX)
VMOVDQU Y12, 128(AX)
VMOVDQU Y13, 160(AX)
VMOVDQU Y14, 192(AX)
VMOVDQU Y15, 224(AX)
VZEROUPPER
RET
// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
// Requires: AVX, AVX2
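// compressParentsAVX2 compresses 8 parent nodes in parallel, writing the 8
// parent chaining values to parents. Each parent's 64-byte message block is
// a pair of adjacent child CVs gathered from cvs; the counter is zero and
// the PARENT flag is set for every node.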
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
MOVQ parents+0(FP), AX
MOVQ cvs+8(FP), CX
MOVQ key+16(FP), DX
// Load transposed block
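	// Y9 = {0..7} * 64, so lane i gathers from the i-th pair of child CVs;
	// the 16 gathers transpose those 64-byte blocks into message-word
	// vectors at (SP)..480(SP).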
VMOVDQU seq<>+0(SB), Y9
VPSLLD $0x06, Y9, Y9
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, (CX)(Y9*1), Y10
VMOVDQU Y10, (SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
VMOVDQU Y10, 32(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
VMOVDQU Y10, 64(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
VMOVDQU Y10, 96(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
VMOVDQU Y10, 128(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
VMOVDQU Y10, 160(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
VMOVDQU Y10, 192(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
VMOVDQU Y10, 224(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
VMOVDQU Y10, 256(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
VMOVDQU Y10, 288(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
VMOVDQU Y10, 320(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
VMOVDQU Y10, 352(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
VMOVDQU Y10, 384(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
VMOVDQU Y10, 416(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
VMOVDQU Y10, 448(SP)
VPCMPEQD Y8, Y8, Y8
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
VMOVDQU Y10, 480(SP)
// Initialize state vectors
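	// Rows 0-7 are the key words, rows 8-11 the IV, rows 12-13 the zero
	// counter, row 14 the block length (64), and row 15 the caller's flags
	// with PARENT (0x04) ORed in. Row 8 is spilled to 512(SP) for use as
	// scratch.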
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VPXOR Y12, Y12, Y12
VPXOR Y13, Y13, Y13
VPBROADCASTD seq<>+4(SB), Y14
VPSLLD $0x06, Y14, Y14
ORL $0x04, flags+24(FP)
VPBROADCASTD flags+24(FP), Y15
VMOVDQU Y8, 512(SP)
// Round 1
VPADDD Y0, Y4, Y0
VPADDD (SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 256(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 2
VPADDD Y0, Y4, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 224(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD (SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 32(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 3
VPADDD Y0, Y4, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 416(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 192(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 160(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 352(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 448(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 128(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD (SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 160(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
VPADDD 384(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 288(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 480(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 320(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 448(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 224(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 64(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 96(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD (SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
VPADDD 288(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 352(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 256(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 384(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 480(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 32(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y0, Y5, Y0
VPADDD 416(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 96(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD (SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 320(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 64(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 192(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 128(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
// Round 7
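	// Round 7 is the last of BLAKE3's seven rounds: the same column-then-diagonal
	// pattern, with the round-7 message permutation.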
VPADDD Y0, Y4, Y0
VPADDD 352(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y0, Y4, Y0
VPADDD 480(SP), Y0, Y0
VPXOR Y12, Y0, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VMOVDQU 512(SP), Y8
VPADDD Y8, Y12, Y8
VPXOR Y4, Y8, Y4
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y1, Y5, Y1
VPADDD 160(SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y5, Y1
VPADDD (SP), Y1, Y1
VPXOR Y13, Y1, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VPADDD Y9, Y13, Y9
VPXOR Y5, Y9, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y2, Y6, Y2
VPADDD 32(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y6, Y2
VPADDD 288(SP), Y2, Y2
VPXOR Y14, Y2, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y10, Y14, Y10
VPXOR Y6, Y10, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y3, Y7, Y3
VPADDD 256(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y7, Y3
VPADDD 192(SP), Y3, Y3
VPXOR Y15, Y3, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y11, Y15, Y11
VPXOR Y7, Y11, Y7
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
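	// Diagonal mixing for the final round.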
VPADDD Y0, Y5, Y0
VPADDD 448(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x0c, Y5, Y8
VPSLLD $0x14, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y0, Y5, Y0
VPADDD 320(SP), Y0, Y0
VPXOR Y15, Y0, Y15
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
VPADDD Y10, Y15, Y10
VPXOR Y5, Y10, Y5
VPSRLD $0x07, Y5, Y8
VPSLLD $0x19, Y5, Y5
VPOR Y5, Y8, Y5
VPADDD Y1, Y6, Y1
VPADDD 64(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x0c, Y6, Y8
VPSLLD $0x14, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y1, Y6, Y1
VPADDD 384(SP), Y1, Y1
VPXOR Y12, Y1, Y12
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
VPADDD Y11, Y12, Y11
VPXOR Y6, Y11, Y6
VPSRLD $0x07, Y6, Y8
VPSLLD $0x19, Y6, Y6
VPOR Y6, Y8, Y6
VPADDD Y2, Y7, Y2
VPADDD 96(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x0c, Y7, Y8
VPSLLD $0x14, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y2, Y7, Y2
VPADDD 128(SP), Y2, Y2
VPXOR Y13, Y2, Y13
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
VMOVDQU 512(SP), Y8
VPADDD Y8, Y13, Y8
VPXOR Y7, Y8, Y7
VMOVDQU Y8, 512(SP)
VPSRLD $0x07, Y7, Y8
VPSLLD $0x19, Y7, Y7
VPOR Y7, Y8, Y7
VPADDD Y3, Y4, Y3
VPADDD 224(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x0c, Y4, Y8
VPSLLD $0x14, Y4, Y4
VPOR Y4, Y8, Y4
VPADDD Y3, Y4, Y3
VPADDD 416(SP), Y3, Y3
VPXOR Y14, Y3, Y14
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
VPADDD Y9, Y14, Y9
VPXOR Y4, Y9, Y4
VPSRLD $0x07, Y4, Y8
VPSLLD $0x19, Y4, Y4
VPOR Y4, Y8, Y4
	// Finalize CVs
	VMOVDQU 512(SP), Y8
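	// With the spilled v8 reloaded into Y8, the new chaining values are
	// cv[i] = v[i] ^ v[i+8] for i = 0..7, one state word per register,
	// eight lanes wide.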
VPXOR Y0, Y8, Y0
VPXOR Y1, Y9, Y1
VPXOR Y2, Y10, Y2
VPXOR Y3, Y11, Y3
VPXOR Y4, Y12, Y4
VPXOR Y5, Y13, Y5
VPXOR Y6, Y14, Y6
VPXOR Y7, Y15, Y7
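	// Y0..Y7 now hold cv word i across all eight lanes. Transpose this 8x8
	// matrix of 32-bit words (dword, then qword, then 128-bit interleaves) so
	// that each register holds one lane's complete 8-word CV.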
VPUNPCKLDQ Y1, Y0, Y8
VPUNPCKHDQ Y1, Y0, Y9
VPUNPCKLDQ Y3, Y2, Y10
VPUNPCKHDQ Y3, Y2, Y11
VPUNPCKLDQ Y5, Y4, Y12
VPUNPCKHDQ Y5, Y4, Y13
VPUNPCKLDQ Y7, Y6, Y14
VPUNPCKHDQ Y7, Y6, Y15
VPUNPCKLQDQ Y10, Y8, Y0
VPUNPCKHQDQ Y10, Y8, Y1
VPUNPCKLQDQ Y11, Y9, Y2
VPUNPCKHQDQ Y11, Y9, Y3
VPUNPCKLQDQ Y14, Y12, Y4
VPUNPCKHQDQ Y14, Y12, Y5
VPUNPCKLQDQ Y15, Y13, Y6
VPUNPCKHQDQ Y15, Y13, Y7
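	// 128-bit lane interleaves complete the transpose.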
VPERM2I128 $0x20, Y4, Y0, Y8
VPERM2I128 $0x31, Y4, Y0, Y12
VPERM2I128 $0x20, Y5, Y1, Y9
VPERM2I128 $0x31, Y5, Y1, Y13
VPERM2I128 $0x20, Y6, Y2, Y10
VPERM2I128 $0x31, Y6, Y2, Y14
VPERM2I128 $0x20, Y7, Y3, Y11
VPERM2I128 $0x31, Y7, Y3, Y15
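	// Store the eight 32-byte chaining values contiguously: lane i's CV at
	// 32*i(AX), 256 bytes in total.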
VMOVDQU Y8, (AX)
VMOVDQU Y9, 32(AX)
VMOVDQU Y10, 64(AX)
VMOVDQU Y11, 96(AX)
VMOVDQU Y12, 128(AX)
VMOVDQU Y13, 160(AX)
VMOVDQU Y14, 192(AX)
VMOVDQU Y15, 224(AX)
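	// Clear the upper YMM state to avoid AVX/SSE transition penalties on return.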
VZEROUPPER
RET