// Code generated by command: go run decode_asm.go -pkg base64 -out ../base64/decode_amd64.s -stubs ../base64/decode_amd64.go. DO NOT EDIT.
//go:build !purego
// +build !purego

#include "textflag.h"
// High-nibble character-class table for input validation, one byte per
// possible high nibble (0x0..0xF), duplicated across both 128-bit lanes
// for VPSHUFB. A character is valid only if the AND of its high-nibble
// class bits (this table) and its low-nibble class bits (the second
// 16-byte table behind the lut pointer) is zero — see the VPTEST in the
// decode loops.
DATA b64_dec_lut_hi<>+0(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+8(SB)/8, $0x1010101010101010
DATA b64_dec_lut_hi<>+16(SB)/8, $0x0804080402011010
DATA b64_dec_lut_hi<>+24(SB)/8, $0x1010101010101010
GLOBL b64_dec_lut_hi<>(SB), RODATA|NOPTR, $32
// VPMADDUBSW multiplier: repeated byte pairs (0x40, 0x01). Each output
// word becomes loByte*0x40 + hiByte*0x01, merging two adjacent 6-bit
// values into one 12-bit field (first packing step of 4 sextets -> 3 bytes).
DATA b64_dec_madd1<>+0(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+8(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+16(SB)/8, $0x0140014001400140
DATA b64_dec_madd1<>+24(SB)/8, $0x0140014001400140
GLOBL b64_dec_madd1<>(SB), RODATA|NOPTR, $32
// VPMADDWD multiplier: repeated word pairs (0x1000, 0x0001). Each output
// dword becomes loWord*0x1000 + hiWord*0x0001, merging two 12-bit fields
// into one 24-bit value (second packing step; the top byte of each dword
// is left as padding to be shuffled away).
DATA b64_dec_madd2<>+0(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+8(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+16(SB)/8, $0x0001100000011000
DATA b64_dec_madd2<>+24(SB)/8, $0x0001100000011000
GLOBL b64_dec_madd2<>(SB), RODATA|NOPTR, $32
// 16-byte VPSHUFB mask applied to the extracted high 128-bit lane: moves
// the packed bytes it still needs into the dword that VPBLENDD ($0x08)
// later merges back into the low-lane result.
DATA b64_dec_shuf_lo<>+0(SB)/8, $0x0000000000000000
DATA b64_dec_shuf_lo<>+8(SB)/8, $0x0600010200000000
GLOBL b64_dec_shuf_lo<>(SB), RODATA|NOPTR, $16
// 32-byte VPSHUFB mask (independent per 128-bit lane): compacts the
// 3 significant bytes of every packed dword into contiguous output
// bytes, squeezing 8 x 3 bytes toward the front of the register so a
// single store emits 24 decoded bytes.
DATA b64_dec_shuf<>+0(SB)/8, $0x090a040506000102
DATA b64_dec_shuf<>+8(SB)/8, $0x000000000c0d0e08
DATA b64_dec_shuf<>+16(SB)/8, $0x0c0d0e08090a0405
DATA b64_dec_shuf<>+24(SB)/8, $0x0000000000000000
GLOBL b64_dec_shuf<>(SB), RODATA|NOPTR, $32
// func decodeAVX2(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
//
// Vectorized base64 decode: each loop iteration consumes 32 input
// characters and produces 24 output bytes. lut points to two consecutive
// 16-byte tables: the per-class roll offsets (added to raw characters to
// obtain 6-bit values) and the low-nibble validation table; the
// high-nibble validation table is the file-local b64_dec_lut_hi.
// Returns (bytes written, bytes consumed). The loop stops early when an
// invalid character is detected, or when fewer than 0x2d (45) source
// bytes remain; the margin keeps the full 32-byte load/store in bounds —
// NOTE(review): the first iteration runs unguarded, so the Go stub
// presumably only calls this with len(src) >= 45 — confirm against caller.
TEXT ·decodeAVX2(SB), NOSPLIT, $0-72
	MOVQ dst_base+0(FP), AX  // AX = &dst[0]
	MOVQ src_base+24(FP), DX // DX = &src[0]
	MOVQ lut+48(FP), SI      // SI = LUT base (2 x 16 bytes)
	MOVQ src_len+32(FP), DI  // DI = remaining input bytes
	MOVB $0x2f, CL
	PINSRB $0x00, CX, X8
	VPBROADCASTB X8, Y8      // Y8 = 32 x 0x2f: '/' and also the nibble/index mask
	XORQ CX, CX              // CX = bytes written so far
	XORQ BX, BX              // BX = bytes consumed so far
	VPXOR Y7, Y7, Y7         // Y7 = zero, used to clear the top 8 store bytes
	VPERMQ $0x44, (SI), Y6   // Y6 = roll-offset table, duplicated into both lanes
	VPERMQ $0x44, 16(SI), Y4 // Y4 = low-nibble class table, duplicated
	VMOVDQA b64_dec_lut_hi<>+0(SB), Y5 // Y5 = high-nibble class table

loop:
	VMOVDQU (DX)(BX*1), Y0 // load 32 base64 characters
	// Validate: per byte, AND of low-nibble and high-nibble class bits
	// must be zero; any overlap flags an invalid character.
	VPSRLD $0x04, Y0, Y2
	VPAND Y8, Y0, Y3
	VPSHUFB Y3, Y4, Y3 // low-nibble class bits
	VPAND Y8, Y2, Y2   // Y2 = high nibble of each byte
	VPSHUFB Y2, Y5, Y9 // high-nibble class bits
	VPTEST Y9, Y3
	JNE done // ZF clear => some byte invalid: bail before this chunk
	// Translate ASCII -> 6-bit values: select a roll offset by high
	// nibble (index adjusted by -1 where the character is '/') and add it.
	VPCMPEQB Y8, Y0, Y3
	VPADDB Y3, Y2, Y2
	VPSHUFB Y2, Y6, Y2
	VPADDB Y0, Y2, Y0
	// Pack 4 x 6 bits into 3 bytes per dword (madd1 then madd2) ...
	VPMADDUBSW b64_dec_madd1<>+0(SB), Y0, Y0
	VPMADDWD b64_dec_madd2<>+0(SB), Y0, Y0
	// ... then shuffle the packed bytes into 24 contiguous output bytes.
	VEXTRACTI128 $0x01, Y0, X1
	VPSHUFB b64_dec_shuf_lo<>+0(SB), X1, X1
	VPSHUFB b64_dec_shuf<>+0(SB), Y0, Y0
	VPBLENDD $0x08, Y1, Y0, Y1 // splice high-lane bytes into the result
	VPBLENDD $0xc0, Y7, Y1, Y1 // zero the 8 trailing padding bytes
	VMOVDQU Y1, (AX)(CX*1)     // 32-byte store: 24 data + 8 zero padding
	ADDQ $0x18, CX // dst offset += 24
	ADDQ $0x20, BX // src offset += 32
	SUBQ $0x20, DI
	CMPQ DI, $0x2d
	JB done // need >= 45 remaining bytes for another full iteration
	JMP loop

done:
	MOVQ CX, ret+56(FP)  // bytes written
	MOVQ BX, ret1+64(FP) // bytes consumed
	VZEROUPPER // required before returning to SSE/Go code after AVX
	RET
// func decodeAVX2URI(dst []byte, src []byte, lut *int8) (int, int)
// Requires: AVX, AVX2, SSE4.1
//
// URL-safe variant of decodeAVX2: identical pipeline, except each input
// chunk first has '_' (0x5f) replaced by '/' (0x2f) so the standard
// classification/roll tables apply. NOTE(review): '-' is presumably
// handled by the lut passed in by the caller — confirm against the Go stub.
// Returns (bytes written, bytes consumed); same 45-byte tail margin and
// early exit on invalid input as decodeAVX2.
TEXT ·decodeAVX2URI(SB), NOSPLIT, $0-72
	MOVB $0x2f, AL
	PINSRB $0x00, AX, X0
	VPBROADCASTB X0, Y0 // Y0 = 32 x '/' (0x2f), substitution value
	MOVB $0x5f, AL
	PINSRB $0x00, AX, X1
	VPBROADCASTB X1, Y1 // Y1 = 32 x '_' (0x5f), character to replace
	MOVQ dst_base+0(FP), AX  // AX = &dst[0]
	MOVQ src_base+24(FP), DX // DX = &src[0]
	MOVQ lut+48(FP), SI      // SI = LUT base (2 x 16 bytes)
	MOVQ src_len+32(FP), DI  // DI = remaining input bytes
	MOVB $0x2f, CL
	PINSRB $0x00, CX, X10
	VPBROADCASTB X10, Y10    // Y10 = 32 x 0x2f: '/' and also the nibble/index mask
	XORQ CX, CX              // CX = bytes written so far
	XORQ BX, BX              // BX = bytes consumed so far
	VPXOR Y9, Y9, Y9         // Y9 = zero, used to clear the top 8 store bytes
	VPERMQ $0x44, (SI), Y8   // Y8 = roll-offset table, duplicated into both lanes
	VPERMQ $0x44, 16(SI), Y6 // Y6 = low-nibble class table, duplicated
	VMOVDQA b64_dec_lut_hi<>+0(SB), Y7 // Y7 = high-nibble class table

loop:
	VMOVDQU (DX)(BX*1), Y2 // load 32 base64 characters
	// Map the URL-safe alphabet onto the standard one: '_' -> '/'.
	VPCMPEQB Y2, Y1, Y4      // mask of '_' bytes
	VPBLENDVB Y4, Y0, Y2, Y2 // substitute '/' where masked
	// Validate: per byte, AND of low-nibble and high-nibble class bits
	// must be zero; any overlap flags an invalid character.
	VPSRLD $0x04, Y2, Y4
	VPAND Y10, Y2, Y5
	VPSHUFB Y5, Y6, Y5 // low-nibble class bits
	VPAND Y10, Y4, Y4  // Y4 = high nibble of each byte
	VPSHUFB Y4, Y7, Y11 // high-nibble class bits
	VPTEST Y11, Y5
	JNE done // ZF clear => some byte invalid: bail before this chunk
	// Translate ASCII -> 6-bit values: select a roll offset by high
	// nibble (index adjusted by -1 where the character is '/') and add it.
	VPCMPEQB Y10, Y2, Y5
	VPADDB Y5, Y4, Y4
	VPSHUFB Y4, Y8, Y4
	VPADDB Y2, Y4, Y2
	// Pack 4 x 6 bits into 3 bytes per dword (madd1 then madd2) ...
	VPMADDUBSW b64_dec_madd1<>+0(SB), Y2, Y2
	VPMADDWD b64_dec_madd2<>+0(SB), Y2, Y2
	// ... then shuffle the packed bytes into 24 contiguous output bytes.
	VEXTRACTI128 $0x01, Y2, X3
	VPSHUFB b64_dec_shuf_lo<>+0(SB), X3, X3
	VPSHUFB b64_dec_shuf<>+0(SB), Y2, Y2
	VPBLENDD $0x08, Y3, Y2, Y3 // splice high-lane bytes into the result
	VPBLENDD $0xc0, Y9, Y3, Y3 // zero the 8 trailing padding bytes
	VMOVDQU Y3, (AX)(CX*1)     // 32-byte store: 24 data + 8 zero padding
	ADDQ $0x18, CX // dst offset += 24
	ADDQ $0x20, BX // src offset += 32
	SUBQ $0x20, DI
	CMPQ DI, $0x2d
	JB done // need >= 45 remaining bytes for another full iteration
	JMP loop

done:
	MOVQ CX, ret+56(FP)  // bytes written
	MOVQ BX, ret1+64(FP) // bytes consumed
	VZEROUPPER // required before returning to SSE/Go code after AVX
	RET