status-go/vendor/github.com/segmentio/asm/keyset/keyset_arm64.s

//go:build !purego
// +build !purego

#include "textflag.h"

// func Lookup(keyset []byte, key []byte) int
TEXT ·Lookup(SB), NOSPLIT, $0-56
	MOVD keyset+0(FP), R0
	MOVD keyset_len+8(FP), R1
	MOVD key+24(FP), R2
	MOVD key_len+32(FP), R3
	MOVD key_cap+40(FP), R4

	// None of the keys in the set are greater than 16 bytes, so if the input
	// key is we can jump straight to not found.
	CMP $16, R3
	BHI notfound

	// We'll be moving the keyset pointer (R0) forward as we compare keys, so
	// make a copy of the starting point (R6). Also add the byte length (R1) to
	// obtain a pointer to the end of the keyset (R5).
	MOVD R0, R6
	ADD  R0, R1, R5

	// Prepare a 64-bit mask of all ones.
	MOVD $-1, R7

	// Prepare a vector of all zeroes.
	VMOV ZR, V1.B16

	// Check that it's safe to load 16 bytes of input. If cap(input)<16, jump
	// to a check that determines whether a tail load is necessary (to avoid a
	// page fault).
	CMP $16, R4
	BLO safeload

load:
	// Load the input key (V0) and pad with zero bytes (V1). To blend the two
	// vectors, we load a mask for the particular key length and then use TBL
	// to select bytes from either V0 or V1.
	VLD1 (R2), [V0.B16]
	MOVD $blend_masks<>(SB), R10
	ADD  R3<<4, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16

loop:
	// Loop through each 16 byte key in the keyset.
	CMP R0, R5
	BEQ notfound

	// Load and compare the next key.
	VLD1.P 16(R0), [V4.B16]
	VCMEQ  V3.B16, V4.B16, V5.B16
	VMOV   V5.D[0], R8
	VMOV   V5.D[1], R9
	AND    R8, R9, R9

	// If the masks match, we found the key.
	CMP R9, R7
	BEQ found
	JMP loop

found:
	// If the key was found, take the position in the keyset and convert it
	// to an index. The keyset pointer (R0) will be 1 key past the match, so
	// subtract the starting pointer (R6), divide by 16 to convert from byte
	// length to an index, and then subtract one.
	SUB  R6, R0, R0
	ADD  R0>>4, ZR, R0
	SUB  $1, R0, R0
	MOVD R0, ret+48(FP)
	RET

notfound:
	// Return the number of keys in the keyset, which is the byte length (R1)
	// divided by 16.
	ADD  R1>>4, ZR, R1
	MOVD R1, ret+48(FP)
	RET

safeload:
	// Check if the input crosses a page boundary. If not, jump back.
	AND $4095, R2, R12
	CMP $4080, R12
	BLS load

	// If it does cross a page boundary, we must assume that loading 16 bytes
	// will cause a fault. Instead, we load the 16 bytes up to and including the
	// key and then shuffle the key forward in the register. We can shuffle and
	// pad with zeroes at the same time to avoid having to also blend (as load
	// does).
	MOVD $16, R12
	SUB  R3, R12, R12
	SUB  R12, R2, R2
	VLD1 (R2), [V0.B16]
	MOVD $shuffle_masks<>(SB), R10
	ADD  R12, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16
	JMP  loop

DATA blend_masks<>+0(SB)/8, $0x1010101010101010
DATA blend_masks<>+8(SB)/8, $0x1010101010101010
DATA blend_masks<>+16(SB)/8, $0x1010101010101000
DATA blend_masks<>+24(SB)/8, $0x1010101010101010
DATA blend_masks<>+32(SB)/8, $0x1010101010100100
DATA blend_masks<>+40(SB)/8, $0x1010101010101010
DATA blend_masks<>+48(SB)/8, $0x1010101010020100
DATA blend_masks<>+56(SB)/8, $0x1010101010101010
DATA blend_masks<>+64(SB)/8, $0x1010101003020100
DATA blend_masks<>+72(SB)/8, $0x1010101010101010
DATA blend_masks<>+80(SB)/8, $0x1010100403020100
DATA blend_masks<>+88(SB)/8, $0x1010101010101010
DATA blend_masks<>+96(SB)/8, $0x1010050403020100
DATA blend_masks<>+104(SB)/8, $0x1010101010101010
DATA blend_masks<>+112(SB)/8, $0x1006050403020100
DATA blend_masks<>+120(SB)/8, $0x1010101010101010
DATA blend_masks<>+128(SB)/8, $0x0706050403020100
DATA blend_masks<>+136(SB)/8, $0x1010101010101010
DATA blend_masks<>+144(SB)/8, $0x0706050403020100
DATA blend_masks<>+152(SB)/8, $0x1010101010101008
DATA blend_masks<>+160(SB)/8, $0x0706050403020100
DATA blend_masks<>+168(SB)/8, $0x1010101010100908
DATA blend_masks<>+176(SB)/8, $0x0706050403020100
DATA blend_masks<>+184(SB)/8, $0x10101010100A0908
DATA blend_masks<>+192(SB)/8, $0x0706050403020100
DATA blend_masks<>+200(SB)/8, $0x101010100B0A0908
DATA blend_masks<>+208(SB)/8, $0x0706050403020100
DATA blend_masks<>+216(SB)/8, $0x1010100C0B0A0908
DATA blend_masks<>+224(SB)/8, $0x0706050403020100
DATA blend_masks<>+232(SB)/8, $0x10100D0C0B0A0908
DATA blend_masks<>+240(SB)/8, $0x0706050403020100
DATA blend_masks<>+248(SB)/8, $0x100E0D0C0B0A0908
DATA blend_masks<>+256(SB)/8, $0x0706050403020100
DATA blend_masks<>+264(SB)/8, $0x0F0E0D0C0B0A0908
GLOBL blend_masks<>(SB), RODATA|NOPTR, $272

DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100
DATA shuffle_masks<>+8(SB)/8, $0x0F0E0D0C0B0A0908
DATA shuffle_masks<>+16(SB)/8, $0x1010101010101010
DATA shuffle_masks<>+24(SB)/8, $0x1010101010101010
GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32