// Code generated by command: go run equal_fold_asm.go -pkg ascii -out ../ascii/equal_fold_amd64.s -stubs ../ascii/equal_fold_amd64.go. DO NOT EDIT.

//go:build !purego
// +build !purego

#include "textflag.h"

// func EqualFoldString(a string, b string) bool
// Requires: AVX, AVX2, SSE4.1
//
// EqualFoldString stores true in ret when a and b have the same length and
// every pair of bytes compares equal after case folding; otherwise it stores
// false.
//
// Two code paths exist:
//   - a byte-at-a-time path that maps both bytes through the lowerCase table
//     and compares the mapped values (used for inputs shorter than 16 bytes,
//     or when the CPU feature bit tested below is clear);
//   - a vector path that treats two bytes as equal when they are identical,
//     or when they differ only in bit 0x20 and the byte is an ASCII letter.
//
// Register roles:
//   CX = a base, BX = b base, DX = bytes still to compare, AX = byte offset
//   SI = scalar difference accumulator / vector comparison mask
//   DI, R8 = scratch bytes, R9 = address of the lowerCase table
//   Y12 = 0x20 in every byte, Y13 = 0x1f, Y14 = 0x9a, Y15 = 0x01
//
// Result protocol: every jump to done / avx_done is taken immediately after
// an instruction that leaves ZF set only when the strings still match, and
// SETEQ turns ZF into the boolean result.
//
// NOTE(review): the file header marks this file as generated. These changes
// (VZEROUPPER on the vector exit, grouped constant setup, unsigned JBE in
// cmp16) should be mirrored in the generator, otherwise they will be lost on
// the next regeneration.
TEXT ·EqualFoldString(SB), NOSPLIT, $0-33
	MOVQ a_base+0(FP), CX
	MOVQ a_len+8(FP), DX
	MOVQ b_base+16(FP), BX

	// Strings of different length never match; ZF is clear on this jump, so
	// done stores false.
	CMPQ DX, b_len+24(FP)
	JNE  done
	XORQ AX, AX

	// The vector path needs at least 16 bytes (its tail reloads the final 16
	// bytes of each string). It is taken only when bit 8 of cpu.X86 is set.
	// NOTE(review): bit 8 is presumably the AVX2 feature flag; confirm
	// against the cpu package.
	CMPQ DX, $0x10
	JB   init_x86
	BTL  $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCS  init_avx

init_x86:
	// R9 points at the lookup table and SI accumulates the OR of every
	// per-byte difference.
	// NOTE(review): lowerCase is presumably a 256-entry byte table mapping
	// each byte to its lowercase form; it is defined outside this file.
	LEAQ github·com∕segmentio∕asm∕ascii·lowerCase+0(SB), R9
	XORL SI, SI

cmp8:
	// Unrolled loop: eight bytes per iteration while at least 8 remain.
	// Each group computes table[a[i]] ^ table[b[i]] and ORs it into SI.
	CMPQ    DX, $0x08
	JB      cmp7
	MOVBLZX (CX)(AX*1), DI
	MOVBLZX (BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 1(CX)(AX*1), DI
	MOVBLZX 1(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 2(CX)(AX*1), DI
	MOVBLZX 2(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 3(CX)(AX*1), DI
	MOVBLZX 3(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 4(CX)(AX*1), DI
	MOVBLZX 4(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 5(CX)(AX*1), DI
	MOVBLZX 5(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 6(CX)(AX*1), DI
	MOVBLZX 6(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI
	MOVBLZX 7(CX)(AX*1), DI
	MOVBLZX 7(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

	// The final ORB set ZF from the accumulated difference: leave on the
	// first chunk that contains a mismatch.
	JNE     done
	ADDQ    $0x08, AX
	SUBQ    $0x08, DX
	JMP     cmp8

	// Tail of the scalar path: 0..7 bytes remain. Control enters the cascade
	// at the label matching the remaining count and falls through every
	// label below it, handling bytes from the highest offset down to 0.
cmp7:
	CMPQ    DX, $0x07
	JB      cmp6
	MOVBLZX 6(CX)(AX*1), DI
	MOVBLZX 6(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp6:
	CMPQ    DX, $0x06
	JB      cmp5
	MOVBLZX 5(CX)(AX*1), DI
	MOVBLZX 5(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp5:
	CMPQ    DX, $0x05
	JB      cmp4
	MOVBLZX 4(CX)(AX*1), DI
	MOVBLZX 4(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp4:
	CMPQ    DX, $0x04
	JB      cmp3
	MOVBLZX 3(CX)(AX*1), DI
	MOVBLZX 3(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp3:
	CMPQ    DX, $0x03
	JB      cmp2
	MOVBLZX 2(CX)(AX*1), DI
	MOVBLZX 2(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp2:
	CMPQ    DX, $0x02
	JB      cmp1
	MOVBLZX 1(CX)(AX*1), DI
	MOVBLZX 1(BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

cmp1:
	// Nothing left to compare: every earlier chunk matched (or both strings
	// are empty), so the answer is true.
	CMPQ    DX, $0x01
	JB      success
	MOVBLZX (CX)(AX*1), DI
	MOVBLZX (BX)(AX*1), R8
	MOVB    (R9)(DI*1), DI
	XORB    (R9)(R8*1), DI
	ORB     DI, SI

done:
	// ZF set means no difference was found.
	SETEQ ret+32(FP)
	RET

success:
	MOVB $0x01, ret+32(FP)
	RET

init_avx:
	// Build the four byte constants. All legacy-SSE inserts are issued
	// before the first VEX-encoded instruction so that SSE and AVX code are
	// not interleaved while the upper YMM halves are in use.
	MOVB         $0x20, SI
	PINSRB       $0x00, SI, X12
	MOVB         $0x1f, SI
	PINSRB       $0x00, SI, X13
	MOVB         $0x9a, SI
	PINSRB       $0x00, SI, X14
	MOVB         $0x01, SI
	PINSRB       $0x00, SI, X15
	VPBROADCASTB X12, Y12
	VPBROADCASTB X13, Y13
	VPBROADCASTB X14, Y14
	VPBROADCASTB X15, Y15

cmp128:
	// Main vector loop: four 32-byte lanes (128 bytes) per iteration.
	CMPQ      DX, $0x80
	JB        cmp64
	VMOVDQU   (CX)(AX*1), Y0
	VMOVDQU   32(CX)(AX*1), Y1
	VMOVDQU   64(CX)(AX*1), Y2
	VMOVDQU   96(CX)(AX*1), Y3
	VMOVDQU   (BX)(AX*1), Y4
	VMOVDQU   32(BX)(AX*1), Y5
	VMOVDQU   64(BX)(AX*1), Y6
	VMOVDQU   96(BX)(AX*1), Y7

	// Lane 0. Every other lane in this function repeats the same sequence.
	// diff = a ^ b
	VXORPD    Y0, Y4, Y4
	// 0xff where diff is exactly the case bit 0x20
	VPCMPEQB  Y12, Y4, Y8
	// a | 0x20, then + 0x1f: 'a'..'z' (0x61..0x7a) land on 0x80..0x99
	VORPD     Y12, Y0, Y0
	VPADDB    Y13, Y0, Y0
	// signed 0x9a > value holds only for 0x80..0x99, i.e. ASCII letters
	VPCMPGTB  Y0, Y14, Y0
	// letter AND differs only in the case bit, reduced to 0x01 per byte
	VPAND     Y8, Y0, Y0
	VPAND     Y15, Y0, Y0
	// 0x01 -> 0x20: the only difference that is tolerated for this byte
	VPSLLW    $0x05, Y0, Y0
	// 0xff where the actual difference equals the tolerated difference
	VPCMPEQB  Y4, Y0, Y0

	// Lane 1.
	VXORPD    Y1, Y5, Y5
	VPCMPEQB  Y12, Y5, Y9
	VORPD     Y12, Y1, Y1
	VPADDB    Y13, Y1, Y1
	VPCMPGTB  Y1, Y14, Y1
	VPAND     Y9, Y1, Y1
	VPAND     Y15, Y1, Y1
	VPSLLW    $0x05, Y1, Y1
	VPCMPEQB  Y5, Y1, Y1

	// Lane 2.
	VXORPD    Y2, Y6, Y6
	VPCMPEQB  Y12, Y6, Y10
	VORPD     Y12, Y2, Y2
	VPADDB    Y13, Y2, Y2
	VPCMPGTB  Y2, Y14, Y2
	VPAND     Y10, Y2, Y2
	VPAND     Y15, Y2, Y2
	VPSLLW    $0x05, Y2, Y2
	VPCMPEQB  Y6, Y2, Y2

	// Lane 3.
	VXORPD    Y3, Y7, Y7
	VPCMPEQB  Y12, Y7, Y11
	VORPD     Y12, Y3, Y3
	VPADDB    Y13, Y3, Y3
	VPCMPGTB  Y3, Y14, Y3
	VPAND     Y11, Y3, Y3
	VPAND     Y15, Y3, Y3
	VPSLLW    $0x05, Y3, Y3
	VPCMPEQB  Y7, Y3, Y3

	// Merge the four lane results; a byte stays 0xff only if it matched in
	// every lane position.
	VPAND     Y1, Y0, Y0
	VPAND     Y3, Y2, Y2
	VPAND     Y2, Y0, Y0
	ADDQ      $0x80, AX
	SUBQ      $0x80, DX

	// The mask is all ones on a full match, so the XOR yields zero (ZF set)
	// exactly when all bytes matched.
	VPMOVMSKB Y0, SI
	XORL      $0xffffffff, SI
	JNE       avx_done
	JMP       cmp128

cmp64:
	// At most one 64-byte step: two 32-byte lanes.
	CMPQ      DX, $0x40
	JB        cmp32
	VMOVDQU   (CX)(AX*1), Y0
	VMOVDQU   32(CX)(AX*1), Y1
	VMOVDQU   (BX)(AX*1), Y2
	VMOVDQU   32(BX)(AX*1), Y3
	VXORPD    Y0, Y2, Y2
	VPCMPEQB  Y12, Y2, Y4
	VORPD     Y12, Y0, Y0
	VPADDB    Y13, Y0, Y0
	VPCMPGTB  Y0, Y14, Y0
	VPAND     Y4, Y0, Y0
	VPAND     Y15, Y0, Y0
	VPSLLW    $0x05, Y0, Y0
	VPCMPEQB  Y2, Y0, Y0
	VXORPD    Y1, Y3, Y3
	VPCMPEQB  Y12, Y3, Y5
	VORPD     Y12, Y1, Y1
	VPADDB    Y13, Y1, Y1
	VPCMPGTB  Y1, Y14, Y1
	VPAND     Y5, Y1, Y1
	VPAND     Y15, Y1, Y1
	VPSLLW    $0x05, Y1, Y1
	VPCMPEQB  Y3, Y1, Y1
	VPAND     Y1, Y0, Y0
	ADDQ      $0x40, AX
	SUBQ      $0x40, DX
	VPMOVMSKB Y0, SI
	XORL      $0xffffffff, SI
	JNE       avx_done

cmp32:
	// At most one 32-byte step.
	CMPQ      DX, $0x20
	JB        cmp16
	VMOVDQU   (CX)(AX*1), Y0
	VMOVDQU   (BX)(AX*1), Y1
	VXORPD    Y0, Y1, Y1
	VPCMPEQB  Y12, Y1, Y2
	VORPD     Y12, Y0, Y0
	VPADDB    Y13, Y0, Y0
	VPCMPGTB  Y0, Y14, Y0
	VPAND     Y2, Y0, Y0
	VPAND     Y15, Y0, Y0
	VPSLLW    $0x05, Y0, Y0
	VPCMPEQB  Y1, Y0, Y0
	ADDQ      $0x20, AX
	SUBQ      $0x20, DX
	VPMOVMSKB Y0, SI
	XORL      $0xffffffff, SI
	JNE       avx_done

cmp16:
	// 0..31 bytes remain. With 16 or fewer, the overlapping tail load alone
	// covers them. DX is a non-negative byte count, so the unsigned
	// condition is used, matching the JB tests elsewhere in this function.
	CMPQ      DX, $0x10
	JBE       cmp_tail
	VMOVDQU   (CX)(AX*1), X0
	VMOVDQU   (BX)(AX*1), X1
	VXORPD    X0, X1, X1
	VPCMPEQB  X12, X1, X2
	VORPD     X12, X0, X0
	VPADDB    X13, X0, X0
	VPCMPGTB  X0, X14, X0
	VPAND     X2, X0, X0
	VPAND     X15, X0, X0
	VPSLLW    $0x05, X0, X0
	VPCMPEQB  X1, X0, X0
	ADDQ      $0x10, AX
	SUBQ      $0x10, DX
	VPMOVMSKB X0, SI
	XORL      $0x0000ffff, SI
	JNE       avx_done

cmp_tail:
	// DX - 16 is zero or negative here; adding it to AX moves the offset
	// back so that the final 16 bytes of both strings are compared. Bytes
	// that were already checked are simply checked again. This is safe
	// because the vector path is only entered with at least 16 bytes.
	SUBQ      $0x10, DX
	ADDQ      DX, AX
	VMOVDQU   (CX)(AX*1), X0
	VMOVDQU   (BX)(AX*1), X1
	VXORPD    X0, X1, X1
	VPCMPEQB  X12, X1, X2
	VORPD     X12, X0, X0
	VPADDB    X13, X0, X0
	VPCMPGTB  X0, X14, X0
	VPAND     X2, X0, X0
	VPAND     X15, X0, X0
	VPSLLW    $0x05, X0, X0
	VPCMPEQB  X1, X0, X0

	// AX is no longer needed as an offset and receives the final mask.
	VPMOVMSKB X0, AX
	XORL      $0x0000ffff, AX

avx_done:
	// Common exit of the vector path. VZEROUPPER clears the upper halves of
	// the YMM registers before returning to code that may use legacy SSE
	// instructions; it does not modify EFLAGS, so ZF from the preceding XORL
	// still carries the result. This exit is kept separate from done so the
	// scalar path never executes an AVX instruction.
	VZEROUPPER
	SETEQ ret+32(FP)
	RET