144 lines
4.6 KiB
ArmAsm
144 lines
4.6 KiB
ArmAsm
//go:build !purego
|
|
// +build !purego
|
|
|
|
#include "textflag.h"
|
|
|
|
// func Lookup(keyset []byte, key []byte) int
//
// Lookup scans keyset, a sequence of 16-byte slots, for the first slot that
// equals key zero-padded to 16 bytes. It returns the index of that slot, or
// len(keyset)/16 when no slot matches (which includes any key longer than 16
// bytes).
//
// len(keyset) must be a multiple of 16: the scan only stops when the cursor
// lands exactly on the end pointer.
//
// Register roles:
//   R0   keyset cursor (advanced by 16 per slot)
//   R1   len(keyset) in bytes
//   R2   key pointer
//   R3   len(key)
//   R4   cap(key)
//   R5   end of keyset (R0 + R1)
//   R6   start of keyset
//   R7   all-ones constant compared against the match mask
//   R8   low half of the match mask
//   R9   high half of the match mask, then the AND of both halves
//   R10  address of the TBL index vector in use
//   R12  scratch (page offset, then 16 - len(key))
//   V0   raw key bytes        V1  all zeroes
//   V2   TBL index vector     V3  zero-padded key
//   V4   current keyset slot  V5  byte-wise equality mask
TEXT ·Lookup(SB), NOSPLIT, $0-56
	MOVD keyset+0(FP), R0
	MOVD keyset_len+8(FP), R1
	MOVD key+24(FP), R2
	MOVD key_len+32(FP), R3
	MOVD key_cap+40(FP), R4

	// No key in the set is longer than 16 bytes, so a longer input key can
	// never match: go straight to notfound.
	CMP $16, R3
	BHI notfound

	// The keyset pointer (R0) moves forward as slots are compared, so keep a
	// copy of the starting point (R6). Adding the byte length (R1) gives a
	// pointer to the end of the keyset (R5).
	MOVD R0, R6
	ADD R0, R1, R5

	// Prepare a 64-bit mask of all ones.
	MOVD $-1, R7

	// Prepare a vector of all zeroes.
	VMOV ZR, V1.B16

	// An empty key pads to sixteen zero bytes, so there is nothing to load.
	// Skipping the load also means a nil key pointer is never dereferenced.
	CBZ R3, emptykey

	// Check that it is safe to load 16 bytes of input. If cap(key) < 16, jump
	// to a check that decides whether a tail load is needed to avoid a page
	// fault.
	CMP $16, R4
	BLO safeload

load:
	// Load 16 bytes at the key pointer (V0), then keep the first len(key)
	// bytes and replace the rest with zeroes. The blend mask for this length
	// lives at blend_masks + len(key)*16; TBL indexes the 32-byte table
	// {V0, V1}, so indices 0x00-0x0F pick key bytes and 0x10 picks a zero
	// byte from V1.
	VLD1 (R2), [V0.B16]
	MOVD $blend_masks<>(SB), R10
	ADD R3<<4, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16

loop:
	// Stop once the cursor reaches the end of the keyset.
	CMP R0, R5
	BEQ notfound

	// Load the next 16-byte slot (post-incrementing R0) and compare it with
	// the padded key byte by byte. Each equal byte yields 0xFF in V5.
	VLD1.P 16(R0), [V4.B16]
	VCMEQ V3.B16, V4.B16, V5.B16
	VMOV V5.D[0], R8
	VMOV V5.D[1], R9
	AND R8, R9, R9

	// Only when all 16 bytes matched is the combined mask all ones. Anything
	// else means this slot differs, so move on to the next one.
	CMP R9, R7
	BNE loop

	// Match. The cursor (R0) is one slot past the match, so subtract the
	// starting pointer (R6), divide by 16 to convert the byte offset into a
	// slot count, and subtract one to get the index.
	SUB R6, R0, R0
	ADD R0>>4, ZR, R0
	SUB $1, R0, R0
	MOVD R0, ret+48(FP)
	RET

notfound:
	// Return the number of slots in the keyset, which is the byte length (R1)
	// divided by 16.
	ADD R1>>4, ZR, R1
	MOVD R1, ret+48(FP)
	RET

safeload:
	// cap(key) < 16. If the key starts at most 4080 bytes into its 4096-byte
	// page, a 16-byte load stays within that page and the regular path is
	// safe.
	AND $4095, R2, R12
	CMP $4080, R12
	BLS load

	// Otherwise a forward load would run into the next page, which must be
	// assumed to fault. Load the 16 bytes that end at the last key byte
	// instead: the key then sits in the top len(key) bytes of V0. The index
	// vector at shuffle_masks + (16 - len(key)) moves those bytes to the
	// front and fills the rest with zeroes in one TBL, so no separate blend
	// is needed.
	MOVD $16, R12
	SUB R3, R12, R12
	SUB R12, R2, R2
	VLD1 (R2), [V0.B16]
	MOVD $shuffle_masks<>(SB), R10
	ADD R12, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16
	JMP loop

emptykey:
	// len(key) == 0: the padded key is all zeroes. Build it directly rather
	// than reading through the key pointer.
	VMOV ZR, V3.B16
	JMP loop
|
|
|
|
// blend_masks holds 17 TBL index vectors of 16 bytes each, one per key length
// from 0 to 16. The vector for length n starts at byte offset n*16. Its first
// n bytes are the identity indices 0x00..(n-1); every remaining byte is 0x10.
// Each vector is written as two little-endian 64-bit words, low word first.
DATA blend_masks<>+0(SB)/8,   $0x1010101010101010 // length 0
DATA blend_masks<>+8(SB)/8,   $0x1010101010101010

DATA blend_masks<>+16(SB)/8,  $0x1010101010101000 // length 1
DATA blend_masks<>+24(SB)/8,  $0x1010101010101010

DATA blend_masks<>+32(SB)/8,  $0x1010101010100100 // length 2
DATA blend_masks<>+40(SB)/8,  $0x1010101010101010

DATA blend_masks<>+48(SB)/8,  $0x1010101010020100 // length 3
DATA blend_masks<>+56(SB)/8,  $0x1010101010101010

DATA blend_masks<>+64(SB)/8,  $0x1010101003020100 // length 4
DATA blend_masks<>+72(SB)/8,  $0x1010101010101010

DATA blend_masks<>+80(SB)/8,  $0x1010100403020100 // length 5
DATA blend_masks<>+88(SB)/8,  $0x1010101010101010

DATA blend_masks<>+96(SB)/8,  $0x1010050403020100 // length 6
DATA blend_masks<>+104(SB)/8, $0x1010101010101010

DATA blend_masks<>+112(SB)/8, $0x1006050403020100 // length 7
DATA blend_masks<>+120(SB)/8, $0x1010101010101010

DATA blend_masks<>+128(SB)/8, $0x0706050403020100 // length 8
DATA blend_masks<>+136(SB)/8, $0x1010101010101010

DATA blend_masks<>+144(SB)/8, $0x0706050403020100 // length 9
DATA blend_masks<>+152(SB)/8, $0x1010101010101008

DATA blend_masks<>+160(SB)/8, $0x0706050403020100 // length 10
DATA blend_masks<>+168(SB)/8, $0x1010101010100908

DATA blend_masks<>+176(SB)/8, $0x0706050403020100 // length 11
DATA blend_masks<>+184(SB)/8, $0x10101010100A0908

DATA blend_masks<>+192(SB)/8, $0x0706050403020100 // length 12
DATA blend_masks<>+200(SB)/8, $0x101010100B0A0908

DATA blend_masks<>+208(SB)/8, $0x0706050403020100 // length 13
DATA blend_masks<>+216(SB)/8, $0x1010100C0B0A0908

DATA blend_masks<>+224(SB)/8, $0x0706050403020100 // length 14
DATA blend_masks<>+232(SB)/8, $0x10100D0C0B0A0908

DATA blend_masks<>+240(SB)/8, $0x0706050403020100 // length 15
DATA blend_masks<>+248(SB)/8, $0x100E0D0C0B0A0908

DATA blend_masks<>+256(SB)/8, $0x0706050403020100 // length 16
DATA blend_masks<>+264(SB)/8, $0x0F0E0D0C0B0A0908

// 17 vectors * 16 bytes = 272 bytes, read-only and pointer-free.
GLOBL blend_masks<>(SB), RODATA|NOPTR, $272
|
|
|
|
// shuffle_masks is a 32-byte strip: the identity indices 0x00..0x0F followed
// by sixteen 0x10 bytes. A 16-byte read starting at byte offset k (0..16)
// gives the indices k..15 followed by k bytes of 0x10, so one table serves
// every shift amount.
DATA shuffle_masks<>+0(SB)/8,  $0x0706050403020100 // indices 0x00..0x07
DATA shuffle_masks<>+8(SB)/8,  $0x0F0E0D0C0B0A0908 // indices 0x08..0x0F
DATA shuffle_masks<>+16(SB)/8, $0x1010101010101010 // 0x10 filler
DATA shuffle_masks<>+24(SB)/8, $0x1010101010101010 // 0x10 filler

// Read-only and pointer-free.
GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32
|