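// Source path: status-go/vendor/github.com/steakknife/hamming/popcount_slices_amd64.s
//
// Listing metadata: 371 lines, 7.2 KiB.
// Language: Go assembler (Plan 9 syntax) for amd64. The listing's "ArmAsm"
// tag is incorrect.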

//
// hamming distance calculations in Go
//
// https://github.com/steakknife/hamming
//
// Copyright © 2014, 2015, 2016 Barry Allard
//
// MIT license
//
#include "textflag.h"
// type SliceHeader struct {
// Data uintptr 0
// Len int 8
// Cap int 16
// }
// 0 x.Data
// 8 x.Len
// 16 x.Cap
// 24 ret
// type StringHeader struct {
// Data uintptr 0
// Len int 8
// }
// 0 x.Data
// 8 x.Len
// 16 ret
// func CountBitsInt8sPopCnt(x []int8) (ret int)
//
// Counts the set bits in x. The bit pattern of an int8 is the same as that of
// a uint8, so this is a tail-jump into the uint8 implementation. The frame
// size is $0, so the jump target addresses the same x/ret slots through FP
// and its RET returns directly to this function's caller.
TEXT ·CountBitsInt8sPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint8sPopCnt(SB)
// func CountBitsInt16sPopCnt(x []int16) (ret int)
//
// Counts the set bits in x. Signedness does not affect the bit pattern, so
// this is a tail-jump into the uint16 implementation. The frame size is $0,
// so the jump target addresses the same x/ret slots through FP and its RET
// returns directly to this function's caller.
TEXT ·CountBitsInt16sPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint16sPopCnt(SB)
// func CountBitsInt32sPopCnt(x []int32) (ret int)
//
// Counts the set bits in x. Signedness does not affect the bit pattern, so
// this is a tail-jump into the uint32 implementation. The frame size is $0,
// so the jump target addresses the same x/ret slots through FP and its RET
// returns directly to this function's caller.
TEXT ·CountBitsInt32sPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint32sPopCnt(SB)
// func CountBitsInt64sPopCnt(x []int64) (ret int)
//
// Counts the set bits in x. Signedness does not affect the bit pattern, so
// this is a tail-jump into the uint64 implementation. The frame size is $0,
// so the jump target addresses the same x/ret slots through FP and its RET
// returns directly to this function's caller.
TEXT ·CountBitsInt64sPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint64sPopCnt(SB)
// func CountBitsUint8sPopCnt(x []uint8) (ret int)
//
// Returns the total number of set bits in the x.Len bytes starting at x.Data.
// A negative length returns 0 without touching memory.
//
// Arguments (offsets from FP): x_base+0, x_len+8, ret+24.
//
// Register roles:
//   DI            read cursor into the data
//   CX            bytes still to be counted
//   AX            result; also accumulates quadword 1 of every 32-byte chunk
//   R9, BX, DX    accumulators for quadwords 0, 2 and 3 of every chunk
//   R10-R13       per-quadword POPCNT results
//
// NOTE(review): POPCNT is used unconditionally; no CPU feature check is
// visible in this file, so presumably the caller performs it - confirm.
TEXT ·CountBitsUint8sPopCnt(SB),NOSPLIT,$0
	XORQ	AX, AX                  // ret = 0
	MOVQ	x_len+8(FP), CX         // CX = bytes remaining
	TESTQ	CX, CX
	JL	done                    // negative length: return 0
	MOVQ	x_base+0(FP), DI        // DI = read cursor
	CMPQ	CX, $32
	JL	remainder               // fewer than 32 bytes: byte loop only
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX
chunk32:                                // invariant: CX >= 32 on entry
	POPCNTQ	0(DI), R10
	POPCNTQ	8(DI), R11
	POPCNTQ	16(DI), R12
	POPCNTQ	24(DI), R13
	ADDQ	R10, R9                 // four independent running sums
	ADDQ	R11, AX
	ADDQ	R12, BX
	ADDQ	R13, DX
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32
	JGE	chunk32
	ADDQ	R9, AX                  // fold the partial sums into AX
	ADDQ	BX, DX
	ADDQ	DX, AX
remainder:
	TESTQ	CX, CX
	JZ	done
byte_loop:                              // invariant: 1 <= CX <= 31 on entry
	MOVBQZX	0(DI), DX               // zero-extending load, no stale upper bits
	POPCNTQ	DX, BX
	ADDQ	BX, AX
	INCQ	DI
	DECQ	CX
	JNZ	byte_loop
done:
	MOVQ	AX, ret+24(FP)
	RET
// func CountBitsUint16sPopCnt(x []uint16) (ret int)
//
// Returns the total number of set bits in the x.Len 16-bit elements starting
// at x.Data. If either of the top two bits of x.Len is set (the length is
// negative, or the byte count x.Len*2 would not fit in a positive int64) the
// function returns 0 without touching memory.
//
// Arguments (offsets from FP): x_base+0, x_len+8, ret+24.
//
// Register roles:
//   DI            read cursor into the data
//   CX            bytes still to be counted (x.Len * 2)
//   AX            result; also accumulates quadword 1 of every 32-byte chunk
//   R9, BX, DX    accumulators for quadwords 0, 2 and 3 of every chunk
//   R10-R13       per-quadword POPCNT results
//
// NOTE(review): POPCNT is used unconditionally; no CPU feature check is
// visible in this file, so presumably the caller performs it - confirm.
TEXT ·CountBitsUint16sPopCnt(SB),NOSPLIT,$0
	XORQ	AX, AX                  // ret = 0
	MOVQ	x_len+8(FP), CX
	MOVQ	CX, BX
	SHRQ	$62, BX                 // BX = x.Len[63:62]
	JNZ	done                    // x.Len*2 would be negative: return 0
	SHLQ	$1, CX                  // CX = bytes remaining
	MOVQ	x_base+0(FP), DI        // DI = read cursor
	CMPQ	CX, $32
	JL	remainder               // fewer than 32 bytes: element loop only
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX
chunk32:                                // invariant: CX >= 32 on entry
	POPCNTQ	0(DI), R10
	POPCNTQ	8(DI), R11
	POPCNTQ	16(DI), R12
	POPCNTQ	24(DI), R13
	ADDQ	R10, R9                 // four independent running sums
	ADDQ	R11, AX
	ADDQ	R12, BX
	ADDQ	R13, DX
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32
	JGE	chunk32
	ADDQ	R9, AX                  // fold the partial sums into AX
	ADDQ	BX, DX
	ADDQ	DX, AX
remainder:
	TESTQ	CX, CX
	JZ	done
word_loop:                              // CX is a non-zero multiple of 2 here
	MOVWQZX	0(DI), DX               // zero-extending load of one element
	POPCNTQ	DX, BX
	ADDQ	BX, AX
	ADDQ	$2, DI
	SUBQ	$2, CX
	JNZ	word_loop
done:
	MOVQ	AX, ret+24(FP)
	RET
// func CountBitsUint32sPopCnt(x []uint32) (ret int)
//
// Returns the total number of set bits in the x.Len 32-bit elements starting
// at x.Data. If any of the top three bits of x.Len is set (the length is
// negative, or the byte count x.Len*4 would not fit in a positive int64) the
// function returns 0 without touching memory.
//
// Arguments (offsets from FP): x_base+0, x_len+8, ret+24.
//
// Register roles:
//   DI            read cursor into the data
//   CX            bytes still to be counted (x.Len * 4)
//   AX            result; also accumulates quadword 1 of every 32-byte chunk
//   R9, BX, DX    accumulators for quadwords 0, 2 and 3 of every chunk
//   R10-R13       per-quadword POPCNT results
//
// NOTE(review): POPCNT is used unconditionally; no CPU feature check is
// visible in this file, so presumably the caller performs it - confirm.
TEXT ·CountBitsUint32sPopCnt(SB),NOSPLIT,$0
	XORQ	AX, AX                  // ret = 0
	MOVQ	x_len+8(FP), CX
	MOVQ	CX, BX
	SHRQ	$61, BX                 // BX = x.Len[63:61]
	JNZ	done                    // x.Len*4 would be negative: return 0
	SHLQ	$2, CX                  // CX = bytes remaining
	MOVQ	x_base+0(FP), DI        // DI = read cursor
	CMPQ	CX, $32
	JL	remainder               // fewer than 32 bytes: element loop only
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX
chunk32:                                // invariant: CX >= 32 on entry
	POPCNTQ	0(DI), R10
	POPCNTQ	8(DI), R11
	POPCNTQ	16(DI), R12
	POPCNTQ	24(DI), R13
	ADDQ	R10, R9                 // four independent running sums
	ADDQ	R11, AX
	ADDQ	R12, BX
	ADDQ	R13, DX
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32
	JGE	chunk32
	ADDQ	R9, AX                  // fold the partial sums into AX
	ADDQ	BX, DX
	ADDQ	DX, AX
remainder:
	TESTQ	CX, CX
	JZ	done
long_loop:                              // CX is a non-zero multiple of 4 here
	MOVL	0(DI), DX               // whole element; MOVL zero-extends into DX
	POPCNTQ	DX, BX
	ADDQ	BX, AX
	ADDQ	$4, DI
	SUBQ	$4, CX
	JNZ	long_loop
done:
	MOVQ	AX, ret+24(FP)
	RET
// func CountBitsUint64sPopCnt(x []uint64) (ret int)
//
// Returns the total number of set bits in the x.Len 64-bit elements starting
// at x.Data. If any of the top four bits of x.Len is set (the length is
// negative, or the byte count x.Len*8 would not fit in a positive int64) the
// function returns 0 without touching memory.
//
// Arguments (offsets from FP): x_base+0, x_len+8, ret+24.
//
// Register roles:
//   DI            read cursor into the data
//   CX            bytes still to be counted (x.Len * 8)
//   AX            result; also accumulates quadword 1 of every 32-byte chunk
//   R9, BX, DX    accumulators for quadwords 0, 2 and 3 of every chunk
//   R10-R13       per-quadword POPCNT results
//
// NOTE(review): POPCNT is used unconditionally; no CPU feature check is
// visible in this file, so presumably the caller performs it - confirm.
TEXT ·CountBitsUint64sPopCnt(SB),NOSPLIT,$0
	XORQ	AX, AX                  // ret = 0
	MOVQ	x_len+8(FP), CX
	MOVQ	CX, BX
	SHRQ	$60, BX                 // BX = x.Len[63:60]
	JNZ	done                    // x.Len*8 would be negative: return 0
	SHLQ	$3, CX                  // CX = bytes remaining
	MOVQ	x_base+0(FP), DI        // DI = read cursor
	CMPQ	CX, $32                 // x.Len*8 >= 32 ?
	JL	remainder               // fewer than 4 elements: element loop only
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX
chunk32:                                // invariant: CX >= 32 on entry
	POPCNTQ	0(DI), R10
	POPCNTQ	8(DI), R11
	POPCNTQ	16(DI), R12
	POPCNTQ	24(DI), R13
	ADDQ	R10, R9                 // four independent running sums
	ADDQ	R11, AX
	ADDQ	R12, BX
	ADDQ	R13, DX
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32                 // x.Len*8 >= 32 ?
	JGE	chunk32
	ADDQ	R9, AX                  // fold the partial sums into AX
	ADDQ	BX, DX
	ADDQ	DX, AX
remainder:
	TESTQ	CX, CX
	JZ	done
quad_loop:                              // CX is 8, 16 or 24 here
	POPCNTQ	0(DI), BX               // count one element straight from memory
	ADDQ	BX, AX
	ADDQ	$8, DI
	SUBQ	$8, CX
	JNZ	quad_loop
done:
	MOVQ	AX, ret+24(FP)
	RET
// func CountBitsBytesPopCnt(x []byte) (ret int)
//
// Counts the set bits in x. In Go, byte is an alias for uint8, so this is a
// tail-jump into the uint8 implementation. The frame size is $0, so the jump
// target addresses the same x/ret slots through FP and its RET returns
// directly to this function's caller.
TEXT ·CountBitsBytesPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint8sPopCnt(SB)
// func CountBitsRunesPopCnt(x []rune) (ret int)
//
// Counts the set bits in x. In Go, rune is an alias for int32, whose elements
// are 4 bytes wide like uint32, so this is a tail-jump into the uint32
// implementation. The frame size is $0, so the jump target addresses the same
// x/ret slots through FP and its RET returns directly to this function's
// caller.
TEXT ·CountBitsRunesPopCnt(SB),NOSPLIT,$0
JMP ·CountBitsUint32sPopCnt(SB)
// func CountBitsStringPopCnt(s string) (ret int)
//
// Returns the total number of set bits in the s.Len bytes starting at s.Data.
// A negative length returns 0 without touching memory.
//
// Arguments (offsets from FP): s_base+0, s_len+8, ret+16. A string header has
// no Cap word, so ret sits at 16 rather than the 24 used by the slice variants.
//
// Register roles:
//   DI            read cursor into the data
//   CX            bytes still to be counted
//   AX            result; also accumulates quadword 1 of every 32-byte chunk
//   R9, BX, DX    accumulators for quadwords 0, 2 and 3 of every chunk
//   R10-R13       per-quadword POPCNT results
//
// NOTE(review): the FP names follow the parameter name "s" from the prototype
// comment above; confirm it matches the Go declaration.
// NOTE(review): POPCNT is used unconditionally; no CPU feature check is
// visible in this file, so presumably the caller performs it - confirm.
TEXT ·CountBitsStringPopCnt(SB),NOSPLIT,$0
	XORQ	AX, AX                  // ret = 0
	MOVQ	s_len+8(FP), CX         // CX = bytes remaining
	TESTQ	CX, CX
	JL	done                    // negative length: return 0
	MOVQ	s_base+0(FP), DI        // DI = read cursor
	CMPQ	CX, $32
	JL	remainder               // fewer than 32 bytes: byte loop only
	XORQ	R9, R9
	XORQ	BX, BX
	XORQ	DX, DX
chunk32:                                // invariant: CX >= 32 on entry
	POPCNTQ	0(DI), R10
	POPCNTQ	8(DI), R11
	POPCNTQ	16(DI), R12
	POPCNTQ	24(DI), R13
	ADDQ	R10, R9                 // four independent running sums
	ADDQ	R11, AX
	ADDQ	R12, BX
	ADDQ	R13, DX
	ADDQ	$32, DI
	SUBQ	$32, CX
	CMPQ	CX, $32
	JGE	chunk32
	ADDQ	R9, AX                  // fold the partial sums into AX
	ADDQ	BX, DX
	ADDQ	DX, AX
remainder:
	TESTQ	CX, CX
	JZ	done
byte_loop:                              // invariant: 1 <= CX <= 31 on entry
	MOVBQZX	0(DI), DX               // zero-extending load, no stale upper bits
	POPCNTQ	DX, BX
	ADDQ	BX, AX
	INCQ	DI
	DECQ	CX
	JNZ	byte_loop
done:
	MOVQ	AX, ret+16(FP)
	RET