// Code generated by command: go run gen.go -out blake3_amd64.s. DO NOT EDIT.

#include "textflag.h"

DATA iv<>+0(SB)/4, $0x6a09e667
DATA iv<>+4(SB)/4, $0xbb67ae85
DATA iv<>+8(SB)/4, $0x3c6ef372
DATA iv<>+12(SB)/4, $0xa54ff53a
GLOBL iv<>(SB), RODATA|NOPTR, $16

DATA seq<>+0(SB)/4, $0x00000000
DATA seq<>+4(SB)/4, $0x00000001
DATA seq<>+8(SB)/4, $0x00000002
DATA seq<>+12(SB)/4, $0x00000003
DATA seq<>+16(SB)/4, $0x00000004
DATA seq<>+20(SB)/4, $0x00000005
DATA seq<>+24(SB)/4, $0x00000006
DATA seq<>+28(SB)/4, $0x00000007
DATA seq<>+32(SB)/4, $0x00000008
DATA seq<>+36(SB)/4, $0x00000009
DATA seq<>+40(SB)/4, $0x0000000a
DATA seq<>+44(SB)/4, $0x0000000b
DATA seq<>+48(SB)/4, $0x0000000c
DATA seq<>+52(SB)/4, $0x0000000d
DATA seq<>+56(SB)/4, $0x0000000e
DATA seq<>+60(SB)/4, $0x0000000f
GLOBL seq<>(SB), RODATA|NOPTR, $64

DATA seq64<>+0(SB)/8, $0x0000000000000000
DATA seq64<>+8(SB)/8, $0x0000000000000001
DATA seq64<>+16(SB)/8, $0x0000000000000002
DATA seq64<>+24(SB)/8, $0x0000000000000003
DATA seq64<>+32(SB)/8, $0x0000000000000004
DATA seq64<>+40(SB)/8, $0x0000000000000005
DATA seq64<>+48(SB)/8, $0x0000000000000006
DATA seq64<>+56(SB)/8, $0x0000000000000007
GLOBL seq64<>(SB), RODATA|NOPTR, $64

DATA shuffle_rot8<>+0(SB)/4, $0x00030201
DATA shuffle_rot8<>+4(SB)/4, $0x04070605
DATA shuffle_rot8<>+8(SB)/4, $0x080b0a09
DATA shuffle_rot8<>+12(SB)/4, $0x0c0f0e0d
DATA shuffle_rot8<>+16(SB)/4, $0x10131211
DATA shuffle_rot8<>+20(SB)/4, $0x14171615
DATA shuffle_rot8<>+24(SB)/4, $0x181b1a19
DATA shuffle_rot8<>+28(SB)/4, $0x1c1f1e1d
GLOBL shuffle_rot8<>(SB), RODATA|NOPTR, $32

DATA shuffle_rot16<>+0(SB)/4, $0x01000302
DATA shuffle_rot16<>+4(SB)/4, $0x05040706
DATA shuffle_rot16<>+8(SB)/4, $0x09080b0a
DATA shuffle_rot16<>+12(SB)/4, $0x0d0c0f0e
DATA shuffle_rot16<>+16(SB)/4, $0x11101312
DATA shuffle_rot16<>+20(SB)/4, $0x15141716
DATA shuffle_rot16<>+24(SB)/4, $0x19181b1a
DATA shuffle_rot16<>+28(SB)/4, $0x1d1c1f1e
GLOBL shuffle_rot16<>(SB), RODATA|NOPTR, $32

// func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX512BW, AVX512F
TEXT ·compressBlocksAVX512(SB), NOSPLIT, $0-40
MOVQ out+0(FP), AX
MOVQ block+8(FP), CX
MOVQ cv+16(FP), DX

// Initialize block vectors
VPBROADCASTD (CX), Z1
VPBROADCASTD 4(CX), Z3
VPBROADCASTD 8(CX), Z5
VPBROADCASTD 12(CX), Z7
VPBROADCASTD 16(CX), Z9
VPBROADCASTD 20(CX), Z11
VPBROADCASTD 24(CX), Z13
VPBROADCASTD 28(CX), Z15
VPBROADCASTD 32(CX), Z17
VPBROADCASTD 36(CX), Z19
VPBROADCASTD 40(CX), Z21
VPBROADCASTD 44(CX), Z23
VPBROADCASTD 48(CX), Z25
VPBROADCASTD 52(CX), Z27
VPBROADCASTD 56(CX), Z29
VPBROADCASTD 60(CX), Z31

// Initialize state vectors
VPBROADCASTD (DX), Z0
VPBROADCASTD 4(DX), Z2
VPBROADCASTD 8(DX), Z4
VPBROADCASTD 12(DX), Z6
VPBROADCASTD 16(DX), Z8
VPBROADCASTD 20(DX), Z10
VPBROADCASTD 24(DX), Z12
VPBROADCASTD 28(DX), Z14
VPBROADCASTD iv<>+0(SB), Z16
VPBROADCASTD iv<>+4(SB), Z18
VPBROADCASTD iv<>+8(SB), Z20
VPBROADCASTD iv<>+12(SB), Z22
VPBROADCASTD counter+24(FP), Z24
VPADDD seq<>+0(SB), Z24, Z24
VPCMPUD $0x01, seq<>+0(SB), Z24, K1
VPBROADCASTD counter+28(FP), Z26
VPADDD.BCST seq<>+4(SB), Z26, K1, Z26
VPBROADCASTD blockLen+32(FP), Z28
VPBROADCASTD flags+36(FP), Z30
// Round 1
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z1, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z3, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z9, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z11, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z17, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z19, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z25, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z27, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 2
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z5, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z13, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z15, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z1, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z3, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z23, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z19, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z29, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 3
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z7, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z9, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z27, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z5, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z13, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z11, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z23, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z31, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 4
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z21, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z15, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z29, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z7, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z9, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z1, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z11, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z17, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 5
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z25, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z27, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z31, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z21, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z15, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z5, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z1, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z3, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 6
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z19, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z29, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z17, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z25, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z27, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z7, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z5, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z13, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 7
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z23, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z31, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z3, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z19, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z29, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z21, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z7, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z9, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8

// Finalize CVs
VPXORD Z0, Z16, Z0
VPXORD Z2, Z18, Z2
VPXORD Z4, Z20, Z4
VPXORD Z6, Z22, Z6
VPXORD Z8, Z24, Z8
VPXORD Z10, Z26, Z10
VPXORD Z12, Z28, Z12
VPXORD Z14, Z30, Z14
VPXORD.BCST (DX), Z16, Z16
VPXORD.BCST 4(DX), Z18, Z18
VPXORD.BCST 8(DX), Z20, Z20
VPXORD.BCST 12(DX), Z22, Z22
VPXORD.BCST 16(DX), Z24, Z24
VPXORD.BCST 20(DX), Z26, Z26
VPXORD.BCST 24(DX), Z28, Z28
VPXORD.BCST 28(DX), Z30, Z30
VMOVDQU32 seq<>+0(SB), Z1
VPSLLD $0x06, Z1, Z1
KXNORD K1, K1, K1
VPSCATTERDD Z0, K1, (AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z2, K1, 4(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z4, K1, 8(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z6, K1, 12(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z8, K1, 16(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z10, K1, 20(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z12, K1, 24(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z14, K1, 28(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z16, K1, 32(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z18, K1, 36(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z20, K1, 40(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z22, K1, 44(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z24, K1, 48(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z26, K1, 52(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z28, K1, 56(AX)(Z1*1)
KXNORD K1, K1, K1
VPSCATTERDD Z30, K1, 60(AX)(Z1*1)
RET

// func compressChunksAVX512(cvs *[16][8]uint32, buf *[16384]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX512BW, AVX512F
TEXT ·compressChunksAVX512(SB), NOSPLIT, $192-36
MOVQ cvs+0(FP), AX
MOVQ buf+8(FP), CX
MOVQ key+16(FP), DX

// Initialize counter
VPBROADCASTD counter+24(FP), Z0
VPADDD seq<>+0(SB), Z0, Z0
VPCMPUD $0x01, seq<>+0(SB), Z0, K1
VPBROADCASTD counter+28(FP), Z2
VPADDD.BCST seq<>+4(SB), Z2, K1, Z2
VMOVDQU32 Z0, (SP)
VMOVDQU32 Z2, 64(SP)

// Initialize flags
VPBROADCASTD flags+32(FP), Z0
VMOVDQU32 Z0, 128(SP)
ORL $0x01, 128(SP)
ORL $0x02, 188(SP)

// Load key
VPBROADCASTD (DX), Z0
VPBROADCASTD 4(DX), Z2
VPBROADCASTD 8(DX), Z4
VPBROADCASTD 12(DX), Z6
VPBROADCASTD 16(DX), Z8
VPBROADCASTD 20(DX), Z10
VPBROADCASTD 24(DX), Z12
VPBROADCASTD 28(DX), Z14

// Loop index
XORQ DX, DX

loop:
// Load transposed block
VMOVDQU32 seq<>+0(SB), Z16
VPSLLD $0x0a, Z16, Z16
KXNORD K1, K1, K1
VPGATHERDD (CX)(Z16*1), K1, Z1
KXNORD K1, K1, K1
VPGATHERDD 4(CX)(Z16*1), K1, Z3
KXNORD K1, K1, K1
VPGATHERDD 8(CX)(Z16*1), K1, Z5
KXNORD K1, K1, K1
VPGATHERDD 12(CX)(Z16*1), K1, Z7
KXNORD K1, K1, K1
VPGATHERDD 16(CX)(Z16*1), K1, Z9
KXNORD K1, K1, K1
VPGATHERDD 20(CX)(Z16*1), K1, Z11
KXNORD K1, K1, K1
VPGATHERDD 24(CX)(Z16*1), K1, Z13
KXNORD K1, K1, K1
VPGATHERDD 28(CX)(Z16*1), K1, Z15
KXNORD K1, K1, K1
VPGATHERDD 32(CX)(Z16*1), K1, Z17
KXNORD K1, K1, K1
VPGATHERDD 36(CX)(Z16*1), K1, Z19
KXNORD K1, K1, K1
VPGATHERDD 40(CX)(Z16*1), K1, Z21
KXNORD K1, K1, K1
VPGATHERDD 44(CX)(Z16*1), K1, Z23
KXNORD K1, K1, K1
VPGATHERDD 48(CX)(Z16*1), K1, Z25
KXNORD K1, K1, K1
VPGATHERDD 52(CX)(Z16*1), K1, Z27
KXNORD K1, K1, K1
VPGATHERDD 56(CX)(Z16*1), K1, Z29
KXNORD K1, K1, K1
VPGATHERDD 60(CX)(Z16*1), K1, Z31
ADDQ $0x40, CX

// Reload state vectors (other than CVs)
VPBROADCASTD iv<>+0(SB), Z16
VPBROADCASTD iv<>+4(SB), Z18
VPBROADCASTD iv<>+8(SB), Z20
VPBROADCASTD iv<>+12(SB), Z22
VMOVDQU32 (SP), Z24
VMOVDQU32 64(SP), Z26
VPBROADCASTD seq<>+4(SB), Z28
VPSLLD $0x06, Z28, Z28
VPBROADCASTD 128(SP)(DX*4), Z30
// Round 1
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z1, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z3, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z9, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z11, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z17, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z19, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z25, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z27, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 2
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z5, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z13, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z15, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z1, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z3, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z23, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z19, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z29, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 3
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z7, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z9, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z27, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z5, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z13, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z11, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z23, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z31, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 4
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z21, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z15, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z29, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z7, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z9, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z1, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z11, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z17, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 5
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z25, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z27, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z19, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z31, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z21, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z29, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z15, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z5, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z7, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z1, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z3, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 6
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z19, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z29, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z23, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z17, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z25, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z31, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z3, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z27, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z7, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z21, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z5, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z13, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z9, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
|
|
// Round 7
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z23, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z0, Z8, Z0
|
|
VPADDD Z31, Z0, Z0
|
|
VPXORD Z24, Z0, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z16, Z24, Z16
|
|
VPXORD Z8, Z16, Z8
|
|
VPRORD $0x07, Z8, Z8
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z11, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z2, Z10, Z2
|
|
VPADDD Z1, Z2, Z2
|
|
VPXORD Z26, Z2, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z18, Z26, Z18
|
|
VPXORD Z10, Z18, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z3, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z4, Z12, Z4
|
|
VPADDD Z19, Z4, Z4
|
|
VPXORD Z28, Z4, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z20, Z28, Z20
|
|
VPXORD Z12, Z20, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z17, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z6, Z14, Z6
|
|
VPADDD Z13, Z6, Z6
|
|
VPXORD Z30, Z6, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z22, Z30, Z22
|
|
VPXORD Z14, Z22, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z29, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x10, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x0c, Z10, Z10
|
|
VPADDD Z0, Z10, Z0
|
|
VPADDD Z21, Z0, Z0
|
|
VPXORD Z30, Z0, Z30
|
|
VPRORD $0x08, Z30, Z30
|
|
VPADDD Z20, Z30, Z20
|
|
VPXORD Z10, Z20, Z10
|
|
VPRORD $0x07, Z10, Z10
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z5, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x10, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x0c, Z12, Z12
|
|
VPADDD Z2, Z12, Z2
|
|
VPADDD Z25, Z2, Z2
|
|
VPXORD Z24, Z2, Z24
|
|
VPRORD $0x08, Z24, Z24
|
|
VPADDD Z22, Z24, Z22
|
|
VPXORD Z12, Z22, Z12
|
|
VPRORD $0x07, Z12, Z12
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z7, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x10, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x0c, Z14, Z14
|
|
VPADDD Z4, Z14, Z4
|
|
VPADDD Z9, Z4, Z4
|
|
VPXORD Z26, Z4, Z26
|
|
VPRORD $0x08, Z26, Z26
|
|
VPADDD Z16, Z26, Z16
|
|
VPXORD Z14, Z16, Z14
|
|
VPRORD $0x07, Z14, Z14
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z15, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x10, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x0c, Z8, Z8
|
|
VPADDD Z6, Z8, Z6
|
|
VPADDD Z27, Z6, Z6
|
|
VPXORD Z28, Z6, Z28
|
|
VPRORD $0x08, Z28, Z28
|
|
VPADDD Z18, Z28, Z18
|
|
VPXORD Z8, Z18, Z8
|
|
VPRORD $0x07, Z8, Z8

// Finalize CVs
VPXORD Z0, Z16, Z0
VPXORD Z2, Z18, Z2
VPXORD Z4, Z20, Z4
VPXORD Z6, Z22, Z6
VPXORD Z8, Z24, Z8
VPXORD Z10, Z26, Z10
VPXORD Z12, Z28, Z12
VPXORD Z14, Z30, Z14

// Loop
INCQ DX
CMPQ DX, $0x00000010
JNE loop

// Finished; transpose CVs
VMOVDQU32 seq<>+0(SB), Z16
VPSLLD $0x05, Z16, Z16
KXNORD K1, K1, K1
VPSCATTERDD Z0, K1, (AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z2, K1, 4(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z4, K1, 8(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z6, K1, 12(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z8, K1, 16(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z10, K1, 20(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z12, K1, 24(AX)(Z16*1)
KXNORD K1, K1, K1
VPSCATTERDD Z14, K1, 28(AX)(Z16*1)
RET

// func compressBlocksAVX2(out *[512]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
// Requires: AVX, AVX2
TEXT ·compressBlocksAVX2(SB), NOSPLIT, $544-40
MOVQ out+0(FP), AX
MOVQ block+8(FP), CX
MOVQ cv+16(FP), DX

// Load block
VPBROADCASTD (CX), Y0
VMOVDQU Y0, (SP)
VPBROADCASTD 4(CX), Y0
VMOVDQU Y0, 32(SP)
VPBROADCASTD 8(CX), Y0
VMOVDQU Y0, 64(SP)
VPBROADCASTD 12(CX), Y0
VMOVDQU Y0, 96(SP)
VPBROADCASTD 16(CX), Y0
VMOVDQU Y0, 128(SP)
VPBROADCASTD 20(CX), Y0
VMOVDQU Y0, 160(SP)
VPBROADCASTD 24(CX), Y0
VMOVDQU Y0, 192(SP)
VPBROADCASTD 28(CX), Y0
VMOVDQU Y0, 224(SP)
VPBROADCASTD 32(CX), Y0
VMOVDQU Y0, 256(SP)
VPBROADCASTD 36(CX), Y0
VMOVDQU Y0, 288(SP)
VPBROADCASTD 40(CX), Y0
VMOVDQU Y0, 320(SP)
VPBROADCASTD 44(CX), Y0
VMOVDQU Y0, 352(SP)
VPBROADCASTD 48(CX), Y0
VMOVDQU Y0, 384(SP)
VPBROADCASTD 52(CX), Y0
VMOVDQU Y0, 416(SP)
VPBROADCASTD 56(CX), Y0
VMOVDQU Y0, 448(SP)
VPBROADCASTD 60(CX), Y0
VMOVDQU Y0, 480(SP)

// Initialize state vectors
VPBROADCASTD (DX), Y0
VPBROADCASTD 4(DX), Y1
VPBROADCASTD 8(DX), Y2
VPBROADCASTD 12(DX), Y3
VPBROADCASTD 16(DX), Y4
VPBROADCASTD 20(DX), Y5
VPBROADCASTD 24(DX), Y6
VPBROADCASTD 28(DX), Y7
VPBROADCASTD iv<>+0(SB), Y8
VPBROADCASTD iv<>+4(SB), Y9
VPBROADCASTD iv<>+8(SB), Y10
VPBROADCASTD iv<>+12(SB), Y11
VPBROADCASTQ counter+24(FP), Y12
VPBROADCASTQ counter+24(FP), Y13
VPADDQ seq64<>+0(SB), Y12, Y12
VPADDQ seq64<>+32(SB), Y13, Y13
VPUNPCKLDQ Y13, Y12, Y14
VPUNPCKHDQ Y13, Y12, Y15
VPUNPCKLDQ Y15, Y14, Y12
VPUNPCKHDQ Y15, Y14, Y13
VPERMQ $0xd8, Y12, Y12
VPERMQ $0xd8, Y13, Y13
VPBROADCASTD blockLen+32(FP), Y14
VPBROADCASTD flags+36(FP), Y15
VMOVDQU Y8, 512(SP)
// Round 1
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD (SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 32(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 128(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 160(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 256(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 288(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 384(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 416(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
|
|
// Round 2
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 64(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 192(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 224(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD (SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 32(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 352(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 288(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 448(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 3
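// Rounds 2-7 repeat the structure of round 1: eight applications of the
// BLAKE3 G function, first down the four columns and then across the four
// diagonals of the 4x4 state. Each G is two half-steps of a += b, a += m,
// d = (d ^ a) ror 16 (then ror 8), c += d, b = (b ^ c) ror 12 (then ror 7);
// the 16/8-bit rotates are byte shuffles (VPSHUFB with the shuffle_rot
// tables) and the 12/7-bit rotates are shift/shift/OR pairs. Only the
// message-word stack offsets change between rounds, following the BLAKE3
// message permutation. One state row is kept at 512(SP) because the sixteen
// rows plus a scratch register do not fit in sixteen YMM registers; Y8
// serves both as that spilled row and as the rotate scratch.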
VPADDD Y0, Y4, Y0
|
|
VPADDD 96(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 128(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 416(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 64(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 192(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 160(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 352(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 480(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
|
|
VPADDD 320(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 224(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 448(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 96(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 128(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD (SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 160(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 256(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
|
|
VPADDD 384(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 416(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 480(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 320(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 224(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 64(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD (SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 32(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
|
|
VPADDD 288(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 448(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 256(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 384(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 416(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 96(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 64(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 192(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 7
VPADDD Y0, Y4, Y0
|
|
VPADDD 352(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 480(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 32(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 288(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 448(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 320(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 96(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 128(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 512(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 512(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VMOVDQU 512(SP), Y8
// Finalize CVs
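// The full 64-byte compression output is produced per lane: the rows
// Y8-Y15 saved to the stack below are the second half of the final state,
// the low eight output words are v[i] ^ v[i+8] (computed next), and the
// high eight words are v[i+8] ^ cv[i] (computed after the first transpose,
// using the CV words broadcast from (DX)). The unpack/VPERM2I128 cascades
// transpose the eight lanes from word-major registers into eight
// contiguous 64-byte blocks in the output buffer.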
VMOVDQU Y8, 256(SP)
|
|
VMOVDQU Y9, 288(SP)
|
|
VMOVDQU Y10, 320(SP)
|
|
VMOVDQU Y11, 352(SP)
|
|
VMOVDQU Y12, 384(SP)
|
|
VMOVDQU Y13, 416(SP)
|
|
VMOVDQU Y14, 448(SP)
|
|
VMOVDQU Y15, 480(SP)
|
|
VPXOR Y0, Y8, Y0
|
|
VPXOR Y1, Y9, Y1
|
|
VPXOR Y2, Y10, Y2
|
|
VPXOR Y3, Y11, Y3
|
|
VPXOR Y4, Y12, Y4
|
|
VPXOR Y5, Y13, Y5
|
|
VPXOR Y6, Y14, Y6
|
|
VPXOR Y7, Y15, Y7
|
|
VPUNPCKLDQ Y1, Y0, Y8
|
|
VPUNPCKHDQ Y1, Y0, Y9
|
|
VPUNPCKLDQ Y3, Y2, Y10
|
|
VPUNPCKHDQ Y3, Y2, Y11
|
|
VPUNPCKLDQ Y5, Y4, Y12
|
|
VPUNPCKHDQ Y5, Y4, Y13
|
|
VPUNPCKLDQ Y7, Y6, Y14
|
|
VPUNPCKHDQ Y7, Y6, Y15
|
|
VPUNPCKLQDQ Y10, Y8, Y0
|
|
VPUNPCKHQDQ Y10, Y8, Y1
|
|
VPUNPCKLQDQ Y11, Y9, Y2
|
|
VPUNPCKHQDQ Y11, Y9, Y3
|
|
VPUNPCKLQDQ Y14, Y12, Y4
|
|
VPUNPCKHQDQ Y14, Y12, Y5
|
|
VPUNPCKLQDQ Y15, Y13, Y6
|
|
VPUNPCKHQDQ Y15, Y13, Y7
|
|
VPERM2I128 $0x20, Y4, Y0, Y8
|
|
VPERM2I128 $0x31, Y4, Y0, Y12
|
|
VPERM2I128 $0x20, Y5, Y1, Y9
|
|
VPERM2I128 $0x31, Y5, Y1, Y13
|
|
VPERM2I128 $0x20, Y6, Y2, Y10
|
|
VPERM2I128 $0x31, Y6, Y2, Y14
|
|
VPERM2I128 $0x20, Y7, Y3, Y11
|
|
VPERM2I128 $0x31, Y7, Y3, Y15
|
|
VMOVDQU Y8, (AX)
|
|
VMOVDQU Y9, 64(AX)
|
|
VMOVDQU Y10, 128(AX)
|
|
VMOVDQU Y11, 192(AX)
|
|
VMOVDQU Y12, 256(AX)
|
|
VMOVDQU Y13, 320(AX)
|
|
VMOVDQU Y14, 384(AX)
|
|
VMOVDQU Y15, 448(AX)
|
|
VMOVDQU 256(SP), Y8
|
|
VMOVDQU 288(SP), Y9
|
|
VMOVDQU 320(SP), Y10
|
|
VMOVDQU 352(SP), Y11
|
|
VMOVDQU 384(SP), Y12
|
|
VMOVDQU 416(SP), Y13
|
|
VMOVDQU 448(SP), Y14
|
|
VMOVDQU 480(SP), Y15
|
|
VPBROADCASTD (DX), Y0
|
|
VPXOR Y0, Y8, Y8
|
|
VPBROADCASTD 4(DX), Y0
|
|
VPXOR Y0, Y9, Y9
|
|
VPBROADCASTD 8(DX), Y0
|
|
VPXOR Y0, Y10, Y10
|
|
VPBROADCASTD 12(DX), Y0
|
|
VPXOR Y0, Y11, Y11
|
|
VPBROADCASTD 16(DX), Y0
|
|
VPXOR Y0, Y12, Y12
|
|
VPBROADCASTD 20(DX), Y0
|
|
VPXOR Y0, Y13, Y13
|
|
VPBROADCASTD 24(DX), Y0
|
|
VPXOR Y0, Y14, Y14
|
|
VPBROADCASTD 28(DX), Y0
|
|
VPXOR Y0, Y15, Y15
|
|
VPUNPCKLDQ Y9, Y8, Y0
|
|
VPUNPCKHDQ Y9, Y8, Y1
|
|
VPUNPCKLDQ Y11, Y10, Y2
|
|
VPUNPCKHDQ Y11, Y10, Y3
|
|
VPUNPCKLDQ Y13, Y12, Y4
|
|
VPUNPCKHDQ Y13, Y12, Y5
|
|
VPUNPCKLDQ Y15, Y14, Y6
|
|
VPUNPCKHDQ Y15, Y14, Y7
|
|
VPUNPCKLQDQ Y2, Y0, Y8
|
|
VPUNPCKHQDQ Y2, Y0, Y9
|
|
VPUNPCKLQDQ Y3, Y1, Y10
|
|
VPUNPCKHQDQ Y3, Y1, Y11
|
|
VPUNPCKLQDQ Y6, Y4, Y12
|
|
VPUNPCKHQDQ Y6, Y4, Y13
|
|
VPUNPCKLQDQ Y7, Y5, Y14
|
|
VPUNPCKHQDQ Y7, Y5, Y15
|
|
VPERM2I128 $0x20, Y12, Y8, Y0
|
|
VPERM2I128 $0x31, Y12, Y8, Y4
|
|
VPERM2I128 $0x20, Y13, Y9, Y1
|
|
VPERM2I128 $0x31, Y13, Y9, Y5
|
|
VPERM2I128 $0x20, Y14, Y10, Y2
|
|
VPERM2I128 $0x31, Y14, Y10, Y6
|
|
VPERM2I128 $0x20, Y15, Y11, Y3
|
|
VPERM2I128 $0x31, Y15, Y11, Y7
|
|
VMOVDQU Y0, 32(AX)
|
|
VMOVDQU Y1, 96(AX)
|
|
VMOVDQU Y2, 160(AX)
|
|
VMOVDQU Y3, 224(AX)
|
|
VMOVDQU Y4, 288(AX)
|
|
VMOVDQU Y5, 352(AX)
|
|
VMOVDQU Y6, 416(AX)
|
|
VMOVDQU Y7, 480(AX)
|
|
RET
// func compressChunksAVX2(cvs *[8][8]uint32, buf *[8192]byte, key *[8]uint32, counter uint64, flags uint32)
// Requires: AVX, AVX2
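// compressChunksAVX2 hashes eight independent 1 KiB chunks in parallel,
// one chunk per 32-bit lane. Y0-Y7 carry each lane's chaining value; the
// loop below compresses one 64-byte block per iteration (16 iterations
// per chunk) and the resulting eight CVs are transposed and written to cvs.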
TEXT ·compressChunksAVX2(SB), NOSPLIT, $672-36
|
|
MOVQ cvs+0(FP), AX
|
|
MOVQ buf+8(FP), CX
|
|
MOVQ key+16(FP), DX
// Load key
VPBROADCASTD (DX), Y0
|
|
VPBROADCASTD 4(DX), Y1
|
|
VPBROADCASTD 8(DX), Y2
|
|
VPBROADCASTD 12(DX), Y3
|
|
VPBROADCASTD 16(DX), Y4
|
|
VPBROADCASTD 20(DX), Y5
|
|
VPBROADCASTD 24(DX), Y6
|
|
VPBROADCASTD 28(DX), Y7
// Initialize counter
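// Every lane hashes its own chunk, so the per-lane counters are
// counter+0 .. counter+7. The VPADDQ/VPUNPCK/VPERMQ sequence below builds
// the eight 64-bit counters and splits them into a vector of low halves
// (state word 12) and a vector of high halves (state word 13), spilled to
// 512(SP) and 544(SP) so each loop iteration can reload them.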
VPBROADCASTQ counter+24(FP), Y12
|
|
VPBROADCASTQ counter+24(FP), Y13
|
|
VPADDQ seq64<>+0(SB), Y12, Y12
|
|
VPADDQ seq64<>+32(SB), Y13, Y13
|
|
VPUNPCKLDQ Y13, Y12, Y14
|
|
VPUNPCKHDQ Y13, Y12, Y15
|
|
VPUNPCKLDQ Y15, Y14, Y12
|
|
VPUNPCKHDQ Y15, Y14, Y13
|
|
VPERMQ $0xd8, Y12, Y12
|
|
VPERMQ $0xd8, Y13, Y13
|
|
VMOVDQU Y12, 512(SP)
|
|
VMOVDQU Y13, 544(SP)
// Initialize flags
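// 576(SP) holds one flags word per block (16 words). All start as the
// caller's flags; CHUNK_START (0x01) is ORed into the first block's word
// and CHUNK_END (0x02) into the last block's word at 636(SP) = 576+15*4.
// Each iteration broadcasts its block's flags with VPBROADCASTD 576(SP)(DX*4).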
VPBROADCASTD flags+32(FP), Y14
|
|
VMOVDQU Y14, 576(SP)
|
|
VMOVDQU Y14, 608(SP)
|
|
ORL $0x01, 576(SP)
|
|
ORL $0x02, 636(SP)
// Loop index
XORQ DX, DX
loop:
// Load transposed block
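// Y9 holds each lane's byte offset into buf: seq (0..7) shifted left by
// 10, i.e. lane*1024. Each VPGATHERDD (with an all-ones mask from
// VPCMPEQD) fetches one 32-bit message word from the current block of all
// eight chunks, building the transposed 16-word block at (SP)..480(SP).
// CX then advances 64 bytes to the next block of every chunk.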
VMOVDQU seq<>+0(SB), Y9
|
|
VPSLLD $0x0a, Y9, Y9
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, (CX)(Y9*1), Y10
|
|
VMOVDQU Y10, (SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 32(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 64(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 96(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 128(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 160(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 192(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 224(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 256(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 288(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 320(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 352(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 384(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 416(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 448(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 480(SP)
|
|
ADDQ $0x40, CX
// Reload state vectors (other than CVs)
VPBROADCASTD iv<>+0(SB), Y8
|
|
VPBROADCASTD iv<>+4(SB), Y9
|
|
VPBROADCASTD iv<>+8(SB), Y10
|
|
VPBROADCASTD iv<>+12(SB), Y11
|
|
VMOVDQU 512(SP), Y12
|
|
VMOVDQU 544(SP), Y13
|
|
VPBROADCASTD seq<>+4(SB), Y14
|
|
VPSLLD $0x06, Y14, Y14
|
|
VPBROADCASTD 576(SP)(DX*4), Y15
|
|
VMOVDQU Y8, 640(SP)
// Round 1
VPADDD Y0, Y4, Y0
|
|
VPADDD (SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 32(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 128(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 160(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 256(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 288(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 384(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 416(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 2
VPADDD Y0, Y4, Y0
|
|
VPADDD 64(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 192(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 224(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD (SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 32(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 352(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 288(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 448(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 3
VPADDD Y0, Y4, Y0
|
|
VPADDD 96(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 128(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 416(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 64(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 192(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 160(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 352(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 480(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 4
VPADDD Y0, Y4, Y0
|
|
VPADDD 320(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 224(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 448(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 96(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 128(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD (SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 160(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 256(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 5
VPADDD Y0, Y4, Y0
|
|
VPADDD 384(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 416(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 288(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 480(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 320(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 448(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 224(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 64(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 96(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD (SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 32(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 6
VPADDD Y0, Y4, Y0
|
|
VPADDD 288(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 448(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 352(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 256(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 384(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 480(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 32(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 416(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 96(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 320(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 64(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 192(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 128(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
// Round 7
VPADDD Y0, Y4, Y0
|
|
VPADDD 352(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y0, Y4, Y0
|
|
VPADDD 480(SP), Y0, Y0
|
|
VPXOR Y12, Y0, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y12, Y8
|
|
VPXOR Y4, Y8, Y4
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD 160(SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y5, Y1
|
|
VPADDD (SP), Y1, Y1
|
|
VPXOR Y13, Y1, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VPADDD Y9, Y13, Y9
|
|
VPXOR Y5, Y9, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 32(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y6, Y2
|
|
VPADDD 288(SP), Y2, Y2
|
|
VPXOR Y14, Y2, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y10, Y14, Y10
|
|
VPXOR Y6, Y10, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 256(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y7, Y3
|
|
VPADDD 192(SP), Y3, Y3
|
|
VPXOR Y15, Y3, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y11, Y15, Y11
|
|
VPXOR Y7, Y11, Y7
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 448(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x0c, Y5, Y8
|
|
VPSLLD $0x14, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y0, Y5, Y0
|
|
VPADDD 320(SP), Y0, Y0
|
|
VPXOR Y15, Y0, Y15
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
|
|
VPADDD Y10, Y15, Y10
|
|
VPXOR Y5, Y10, Y5
|
|
VPSRLD $0x07, Y5, Y8
|
|
VPSLLD $0x19, Y5, Y5
|
|
VPOR Y5, Y8, Y5
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 64(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x0c, Y6, Y8
|
|
VPSLLD $0x14, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y1, Y6, Y1
|
|
VPADDD 384(SP), Y1, Y1
|
|
VPXOR Y12, Y1, Y12
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
|
|
VPADDD Y11, Y12, Y11
|
|
VPXOR Y6, Y11, Y6
|
|
VPSRLD $0x07, Y6, Y8
|
|
VPSLLD $0x19, Y6, Y6
|
|
VPOR Y6, Y8, Y6
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 96(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x0c, Y7, Y8
|
|
VPSLLD $0x14, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y2, Y7, Y2
|
|
VPADDD 128(SP), Y2, Y2
|
|
VPXOR Y13, Y2, Y13
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
|
|
VMOVDQU 640(SP), Y8
|
|
VPADDD Y8, Y13, Y8
|
|
VPXOR Y7, Y8, Y7
|
|
VMOVDQU Y8, 640(SP)
|
|
VPSRLD $0x07, Y7, Y8
|
|
VPSLLD $0x19, Y7, Y7
|
|
VPOR Y7, Y8, Y7
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 224(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x0c, Y4, Y8
|
|
VPSLLD $0x14, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VPADDD Y3, Y4, Y3
|
|
VPADDD 416(SP), Y3, Y3
|
|
VPXOR Y14, Y3, Y14
|
|
VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
|
|
VPADDD Y9, Y14, Y9
|
|
VPXOR Y4, Y9, Y4
|
|
VPSRLD $0x07, Y4, Y8
|
|
VPSLLD $0x19, Y4, Y4
|
|
VPOR Y4, Y8, Y4
|
|
VMOVDQU 640(SP), Y8
// Finalize CVs
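// Feed-forward: each lane's new chaining value is v[i] ^ v[i+8]. Only the
// low half of the compression output is needed here, and it lands in
// Y0-Y7, exactly where the next iteration expects its input CVs.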
VPXOR Y0, Y8, Y0
|
|
VPXOR Y1, Y9, Y1
|
|
VPXOR Y2, Y10, Y2
|
|
VPXOR Y3, Y11, Y3
|
|
VPXOR Y4, Y12, Y4
|
|
VPXOR Y5, Y13, Y5
|
|
VPXOR Y6, Y14, Y6
|
|
VPXOR Y7, Y15, Y7
// Loop
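// Each 1 KiB chunk has 16 blocks, so the loop runs until DX reaches 0x10.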
INCQ DX
|
|
CMPQ DX, $0x00000010
|
|
JNE loop
// Finished; transpose CVs
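// The CVs are currently word-major (Y0 holds word 0 of every chunk, Y1
// holds word 1, and so on). The unpack/VPERM2I128 cascade below transposes
// them so that each 32-byte row is one chunk's complete CV, then stores
// the eight rows to cvs in order.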
VPUNPCKLDQ Y1, Y0, Y8
|
|
VPUNPCKHDQ Y1, Y0, Y9
|
|
VPUNPCKLDQ Y3, Y2, Y10
|
|
VPUNPCKHDQ Y3, Y2, Y11
|
|
VPUNPCKLDQ Y5, Y4, Y12
|
|
VPUNPCKHDQ Y5, Y4, Y13
|
|
VPUNPCKLDQ Y7, Y6, Y14
|
|
VPUNPCKHDQ Y7, Y6, Y15
|
|
VPUNPCKLQDQ Y10, Y8, Y0
|
|
VPUNPCKHQDQ Y10, Y8, Y1
|
|
VPUNPCKLQDQ Y11, Y9, Y2
|
|
VPUNPCKHQDQ Y11, Y9, Y3
|
|
VPUNPCKLQDQ Y14, Y12, Y4
|
|
VPUNPCKHQDQ Y14, Y12, Y5
|
|
VPUNPCKLQDQ Y15, Y13, Y6
|
|
VPUNPCKHQDQ Y15, Y13, Y7
|
|
VPERM2I128 $0x20, Y4, Y0, Y8
|
|
VPERM2I128 $0x31, Y4, Y0, Y12
|
|
VPERM2I128 $0x20, Y5, Y1, Y9
|
|
VPERM2I128 $0x31, Y5, Y1, Y13
|
|
VPERM2I128 $0x20, Y6, Y2, Y10
|
|
VPERM2I128 $0x31, Y6, Y2, Y14
|
|
VPERM2I128 $0x20, Y7, Y3, Y11
|
|
VPERM2I128 $0x31, Y7, Y3, Y15
|
|
VMOVDQU Y8, (AX)
|
|
VMOVDQU Y9, 32(AX)
|
|
VMOVDQU Y10, 64(AX)
|
|
VMOVDQU Y11, 96(AX)
|
|
VMOVDQU Y12, 128(AX)
|
|
VMOVDQU Y13, 160(AX)
|
|
VMOVDQU Y14, 192(AX)
|
|
VMOVDQU Y15, 224(AX)
|
|
RET
// func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
// Requires: AVX, AVX2
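// compressParentsAVX2 compresses eight parent nodes in parallel. Each
// parent's 64-byte message block is a pair of adjacent child CVs, gathered
// from cvs below with a 64-byte stride (seq shifted left by 6). Parent
// compressions use a zero counter (Y12/Y13 are cleared), a block length of
// 64 (Y14), and OR the PARENT flag (0x04) into the caller's flags.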
TEXT ·compressParentsAVX2(SB), NOSPLIT, $544-28
|
|
MOVQ parents+0(FP), AX
|
|
MOVQ cvs+8(FP), CX
|
|
MOVQ key+16(FP), DX
// Load transposed block
VMOVDQU seq<>+0(SB), Y9
|
|
VPSLLD $0x06, Y9, Y9
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, (CX)(Y9*1), Y10
|
|
VMOVDQU Y10, (SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 4(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 32(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 8(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 64(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 12(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 96(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 16(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 128(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 20(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 160(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 24(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 192(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 28(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 224(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 32(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 256(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 36(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 288(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 40(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 320(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 44(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 352(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 48(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 384(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 52(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 416(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 56(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 448(SP)
|
|
VPCMPEQD Y8, Y8, Y8
|
|
VPGATHERDD Y8, 60(CX)(Y9*1), Y10
|
|
VMOVDQU Y10, 480(SP)
// Initialize state vectors
VPBROADCASTD (DX), Y0
|
|
VPBROADCASTD 4(DX), Y1
|
|
VPBROADCASTD 8(DX), Y2
|
|
VPBROADCASTD 12(DX), Y3
|
|
VPBROADCASTD 16(DX), Y4
|
|
VPBROADCASTD 20(DX), Y5
|
|
VPBROADCASTD 24(DX), Y6
|
|
VPBROADCASTD 28(DX), Y7
|
|
VPBROADCASTD iv<>+0(SB), Y8
|
|
VPBROADCASTD iv<>+4(SB), Y9
|
|
VPBROADCASTD iv<>+8(SB), Y10
|
|
VPBROADCASTD iv<>+12(SB), Y11
|
|
VPXOR Y12, Y12, Y12
|
|
VPXOR Y13, Y13, Y13
|
|
VPBROADCASTD seq<>+4(SB), Y14
|
|
VPSLLD $0x06, Y14, Y14
|
|
ORL $0x04, flags+24(FP)
|
|
VPBROADCASTD flags+24(FP), Y15
|
|
VMOVDQU Y8, 512(SP)
// Round 1
	VPADDD Y0, Y4, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 256(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

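	// Rounds 2-7 repeat the same column/diagonal pattern; only the order in
	// which the 16 message-word slots on the stack are read changes, following
	// the fixed BLAKE3 message permutation.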
	// Round 2
	VPADDD Y0, Y4, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 224(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 32(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 3
	VPADDD Y0, Y4, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 416(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 192(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 160(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 352(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 4
	VPADDD Y0, Y4, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 448(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 128(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD (SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 160(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 5
	VPADDD Y0, Y4, Y0
	VPADDD 384(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 288(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 480(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 320(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 448(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 224(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 64(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 96(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD (SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 6
	VPADDD Y0, Y4, Y0
	VPADDD 288(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 352(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 256(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 384(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 480(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 32(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 416(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 96(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 320(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 64(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 192(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 128(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4

	// Round 7
	VPADDD Y0, Y4, Y0
	VPADDD 352(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y0, Y4, Y0
	VPADDD 480(SP), Y0, Y0
	VPXOR Y12, Y0, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y12, Y8
	VPXOR Y4, Y8, Y4
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y1, Y5, Y1
	VPADDD 160(SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y5, Y1
	VPADDD (SP), Y1, Y1
	VPXOR Y13, Y1, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VPADDD Y9, Y13, Y9
	VPXOR Y5, Y9, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y2, Y6, Y2
	VPADDD 32(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y6, Y2
	VPADDD 288(SP), Y2, Y2
	VPXOR Y14, Y2, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y10, Y14, Y10
	VPXOR Y6, Y10, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y3, Y7, Y3
	VPADDD 256(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y7, Y3
	VPADDD 192(SP), Y3, Y3
	VPXOR Y15, Y3, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y11, Y15, Y11
	VPXOR Y7, Y11, Y7
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y0, Y5, Y0
	VPADDD 448(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot16<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x0c, Y5, Y8
	VPSLLD $0x14, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y0, Y5, Y0
	VPADDD 320(SP), Y0, Y0
	VPXOR Y15, Y0, Y15
	VPSHUFB shuffle_rot8<>+0(SB), Y15, Y15
	VPADDD Y10, Y15, Y10
	VPXOR Y5, Y10, Y5
	VPSRLD $0x07, Y5, Y8
	VPSLLD $0x19, Y5, Y5
	VPOR Y5, Y8, Y5
	VPADDD Y1, Y6, Y1
	VPADDD 64(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot16<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x0c, Y6, Y8
	VPSLLD $0x14, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y1, Y6, Y1
	VPADDD 384(SP), Y1, Y1
	VPXOR Y12, Y1, Y12
	VPSHUFB shuffle_rot8<>+0(SB), Y12, Y12
	VPADDD Y11, Y12, Y11
	VPXOR Y6, Y11, Y6
	VPSRLD $0x07, Y6, Y8
	VPSLLD $0x19, Y6, Y6
	VPOR Y6, Y8, Y6
	VPADDD Y2, Y7, Y2
	VPADDD 96(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot16<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x0c, Y7, Y8
	VPSLLD $0x14, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y2, Y7, Y2
	VPADDD 128(SP), Y2, Y2
	VPXOR Y13, Y2, Y13
	VPSHUFB shuffle_rot8<>+0(SB), Y13, Y13
	VMOVDQU 512(SP), Y8
	VPADDD Y8, Y13, Y8
	VPXOR Y7, Y8, Y7
	VMOVDQU Y8, 512(SP)
	VPSRLD $0x07, Y7, Y8
	VPSLLD $0x19, Y7, Y7
	VPOR Y7, Y8, Y7
	VPADDD Y3, Y4, Y3
	VPADDD 224(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot16<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x0c, Y4, Y8
	VPSLLD $0x14, Y4, Y4
	VPOR Y4, Y8, Y4
	VPADDD Y3, Y4, Y3
	VPADDD 416(SP), Y3, Y3
	VPXOR Y14, Y3, Y14
	VPSHUFB shuffle_rot8<>+0(SB), Y14, Y14
	VPADDD Y9, Y14, Y9
	VPXOR Y4, Y9, Y4
	VPSRLD $0x07, Y4, Y8
	VPSLLD $0x19, Y4, Y4
	VPOR Y4, Y8, Y4
	VMOVDQU 512(SP), Y8

	// Finalize CVs
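	// The compression output is truncated to a chaining value: each of the
	// eight state rows 0-7 is XORed with the corresponding row 8-15, then the
	// unpack/permute sequence transposes the result from one-word-per-register
	// form back into eight contiguous 32-byte CVs, stored to parents.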
	VPXOR Y0, Y8, Y0
	VPXOR Y1, Y9, Y1
	VPXOR Y2, Y10, Y2
	VPXOR Y3, Y11, Y3
	VPXOR Y4, Y12, Y4
	VPXOR Y5, Y13, Y5
	VPXOR Y6, Y14, Y6
	VPXOR Y7, Y15, Y7
	VPUNPCKLDQ Y1, Y0, Y8
	VPUNPCKHDQ Y1, Y0, Y9
	VPUNPCKLDQ Y3, Y2, Y10
	VPUNPCKHDQ Y3, Y2, Y11
	VPUNPCKLDQ Y5, Y4, Y12
	VPUNPCKHDQ Y5, Y4, Y13
	VPUNPCKLDQ Y7, Y6, Y14
	VPUNPCKHDQ Y7, Y6, Y15
	VPUNPCKLQDQ Y10, Y8, Y0
	VPUNPCKHQDQ Y10, Y8, Y1
	VPUNPCKLQDQ Y11, Y9, Y2
	VPUNPCKHQDQ Y11, Y9, Y3
	VPUNPCKLQDQ Y14, Y12, Y4
	VPUNPCKHQDQ Y14, Y12, Y5
	VPUNPCKLQDQ Y15, Y13, Y6
	VPUNPCKHQDQ Y15, Y13, Y7
	VPERM2I128 $0x20, Y4, Y0, Y8
	VPERM2I128 $0x31, Y4, Y0, Y12
	VPERM2I128 $0x20, Y5, Y1, Y9
	VPERM2I128 $0x31, Y5, Y1, Y13
	VPERM2I128 $0x20, Y6, Y2, Y10
	VPERM2I128 $0x31, Y6, Y2, Y14
	VPERM2I128 $0x20, Y7, Y3, Y11
	VPERM2I128 $0x31, Y7, Y3, Y15
	VMOVDQU Y8, (AX)
	VMOVDQU Y9, 32(AX)
	VMOVDQU Y10, 64(AX)
	VMOVDQU Y11, 96(AX)
	VMOVDQU Y12, 128(AX)
	VMOVDQU Y13, 160(AX)
	VMOVDQU Y14, 192(AX)
	VMOVDQU Y15, 224(AX)
	RET