311 lines
8.6 KiB
ArmAsm
311 lines
8.6 KiB
ArmAsm
|
//+build !noasm
|
||
|
//+build !appengine
|
||
|
//+build !gccgo
|
||
|
//+build !nopshufb
|
||
|
|
||
|
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||
|
|
||
|
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
|
||
|
// and http://jerasure.org/jerasure/gf-complete/tree/master
|
||
|
|
||
|
// func galMulSSSE3Xor(low, high, in, out []byte)
//
// For each full 16-byte block of in, every byte is split into its low and
// high nibble; the low nibble indexes the 16-byte table at low, the high
// nibble indexes the 16-byte table at high, and the XOR of both lookups is
// XORed into the matching 16 bytes of out.
//
// Only len(in) is read: len(in)/16 blocks are processed and any trailing
// len(in)%16 bytes are left untouched. The length of out is not checked here.
// Aligned loads/stores are used only when both in and out are 16-byte aligned.
TEXT ·galMulSSSE3Xor(SB), 7, $0
	// Load the two 16-byte lookup tables.
	MOVQ  low+0(FP), SI        // SI: &low[0]
	MOVQ  high+24(FP), DX      // DX: &high[0]
	MOVOU (SI), X6             // X6: low table
	MOVOU (DX), X7             // X7: high table

	// Build the nibble mask: 0x0f replicated into all 16 bytes of X8.
	MOVQ   $15, BX
	MOVQ   BX, X8
	PXOR   X5, X5              // X5: all-zero shuffle control
	PSHUFB X5, X8              // every byte of X8 = byte 0 of X8

	MOVQ  in+48(FP), SI        // SI: &in[0]
	MOVQ  out+72(FP), DX       // DX: &out[0]
	MOVQ  in_len+56(FP), R9    // R9: len(in)
	SHRQ  $4, R9               // R9: number of 16-byte blocks
	TESTQ R9, R9
	JZ    xor_ssse3_done

	// Any low bit set in either pointer selects the unaligned loop.
	MOVQ  SI, AX
	ORQ   DX, AX
	TESTQ $15, AX
	JNZ   xor_ssse3_unaligned

xor_ssse3_aligned:
	MOVOA  (SI), X0            // X0: in block
	MOVOA  X0, X1
	PSRLQ  $4, X1              // move high nibbles down (stray bits masked below)
	PAND   X8, X0              // X0: low nibbles
	PAND   X8, X1              // X1: high nibbles
	MOVOA  X6, X2              // PSHUFB overwrites its destination: use copies
	MOVOA  X7, X3
	PSHUFB X0, X2              // X2: low table lookup
	PSHUFB X1, X3              // X3: high table lookup
	PXOR   X2, X3              // X3: product
	MOVOA  (DX), X4            // X4: existing out block
	PXOR   X4, X3              // X3: product xor existing out
	MOVOA  X3, (DX)
	ADDQ   $16, SI             // in += 16
	ADDQ   $16, DX             // out += 16
	DECQ   R9
	JNZ    xor_ssse3_aligned
	RET

xor_ssse3_unaligned:
	MOVOU  (SI), X0            // X0: in block
	MOVOA  X0, X1
	PSRLQ  $4, X1              // move high nibbles down (stray bits masked below)
	PAND   X8, X0              // X0: low nibbles
	PAND   X8, X1              // X1: high nibbles
	MOVOA  X6, X2              // PSHUFB overwrites its destination: use copies
	MOVOA  X7, X3
	PSHUFB X0, X2              // X2: low table lookup
	PSHUFB X1, X3              // X3: high table lookup
	PXOR   X2, X3              // X3: product
	MOVOU  (DX), X4            // X4: existing out block
	PXOR   X4, X3              // X3: product xor existing out
	MOVOU  X3, (DX)
	ADDQ   $16, SI             // in += 16
	ADDQ   $16, DX             // out += 16
	DECQ   R9
	JNZ    xor_ssse3_unaligned

xor_ssse3_done:
	RET
|
||
|
|
||
|
// func galMulSSSE3(low, high, in, out []byte)
//
// For each full 16-byte block of in, every byte is split into its low and
// high nibble; the low nibble indexes the 16-byte table at low, the high
// nibble indexes the 16-byte table at high, and the XOR of both lookups is
// stored to the matching 16 bytes of out (overwriting them).
//
// Only len(in) is read: len(in)/16 blocks are processed and any trailing
// len(in)%16 bytes are left untouched. The length of out is not checked here.
// Aligned loads/stores are used only when both in and out are 16-byte aligned.
TEXT ·galMulSSSE3(SB), 7, $0
	// Load the two 16-byte lookup tables.
	MOVQ  low+0(FP), SI        // SI: &low[0]
	MOVQ  high+24(FP), DX      // DX: &high[0]
	MOVOU (SI), X6             // X6: low table
	MOVOU (DX), X7             // X7: high table

	// Build the nibble mask: 0x0f replicated into all 16 bytes of X8.
	MOVQ   $15, BX
	MOVQ   BX, X8
	PXOR   X5, X5              // X5: all-zero shuffle control
	PSHUFB X5, X8              // every byte of X8 = byte 0 of X8

	MOVQ  in+48(FP), SI        // SI: &in[0]
	MOVQ  out+72(FP), DX       // DX: &out[0]
	MOVQ  in_len+56(FP), R9    // R9: len(in)
	SHRQ  $4, R9               // R9: number of 16-byte blocks
	TESTQ R9, R9
	JZ    mul_ssse3_done

	// Any low bit set in either pointer selects the unaligned loop.
	MOVQ  SI, AX
	ORQ   DX, AX
	TESTQ $15, AX
	JNZ   mul_ssse3_unaligned

mul_ssse3_aligned:
	MOVOA  (SI), X0            // X0: in block
	MOVOA  X0, X1
	PSRLQ  $4, X1              // move high nibbles down (stray bits masked below)
	PAND   X8, X0              // X0: low nibbles
	PAND   X8, X1              // X1: high nibbles
	MOVOA  X6, X2              // PSHUFB overwrites its destination: use copies
	MOVOA  X7, X3
	PSHUFB X0, X2              // X2: low table lookup
	PSHUFB X1, X3              // X3: high table lookup
	PXOR   X2, X3              // X3: product
	MOVOA  X3, (DX)
	ADDQ   $16, SI             // in += 16
	ADDQ   $16, DX             // out += 16
	DECQ   R9
	JNZ    mul_ssse3_aligned
	RET

mul_ssse3_unaligned:
	MOVOU  (SI), X0            // X0: in block
	MOVOA  X0, X1
	PSRLQ  $4, X1              // move high nibbles down (stray bits masked below)
	PAND   X8, X0              // X0: low nibbles
	PAND   X8, X1              // X1: high nibbles
	MOVOA  X6, X2              // PSHUFB overwrites its destination: use copies
	MOVOA  X7, X3
	PSHUFB X0, X2              // X2: low table lookup
	PSHUFB X1, X3              // X3: high table lookup
	PXOR   X2, X3              // X3: product
	MOVOU  X3, (DX)
	ADDQ   $16, SI             // in += 16
	ADDQ   $16, DX             // out += 16
	DECQ   R9
	JNZ    mul_ssse3_unaligned

mul_ssse3_done:
	RET
|
||
|
|
||
|
// func galMulAVX2Xor(low, high, in, out []byte)
//
// For each full 32-byte block of in, every byte is split into its low and
// high nibble; the low nibble indexes the 16-byte table at low, the high
// nibble indexes the 16-byte table at high, and the XOR of both lookups is
// XORed into the matching 32 bytes of out.
//
// Only len(in) is read: len(in)/32 blocks are processed and any trailing
// len(in)%32 bytes are left untouched. The length of out is not checked here.
// Uses AVX2 instructions; CPU support is not verified in this routine.
TEXT ·galMulAVX2Xor(SB), 7, $0
	MOVQ low+0(FP), SI           // SI: &low[0]
	MOVQ high+24(FP), DX         // DX: &high[0]
	MOVQ $15, BX                 // BX: low-nibble mask value
	MOVQ BX, X5
	MOVQ in_len+56(FP), R9       // R9: len(in)

	// VPSHUFB shuffles within each 128-bit lane, so both lanes need a copy of
	// the table. Loaded the same way as in the 64-byte variants of this file.
	VBROADCASTI128 (SI), Y6      // Y6: low table in both lanes
	VBROADCASTI128 (DX), Y7      // Y7: high table in both lanes
	VPBROADCASTB   X5, Y8        // Y8: 0x0f in every byte

	SHRQ  $5, R9                 // R9: number of 32-byte blocks
	MOVQ  out+72(FP), DX         // DX: &out[0]
	MOVQ  in+48(FP), SI          // SI: &in[0]
	TESTQ R9, R9
	JZ    done_xor_avx2

loopback_xor_avx2:
	VMOVDQU (SI), Y0             // Y0: in block
	VMOVDQU (DX), Y4             // Y4: existing out block
	VPSRLQ  $4, Y0, Y1           // Y1: in >> 4 (stray bits masked below)
	VPAND   Y8, Y0, Y0           // Y0: low nibbles
	VPAND   Y8, Y1, Y1           // Y1: high nibbles
	VPSHUFB Y0, Y6, Y2           // Y2: low table lookup
	VPSHUFB Y1, Y7, Y3           // Y3: high table lookup
	VPXOR   Y3, Y2, Y3           // Y3: product
	VPXOR   Y4, Y3, Y4           // Y4: product xor existing out
	VMOVDQU Y4, (DX)

	ADDQ $32, SI                 // in += 32
	ADDQ $32, DX                 // out += 32
	SUBQ $1, R9
	JNZ  loopback_xor_avx2

done_xor_avx2:
	VZEROUPPER                   // clear upper YMM halves before returning
	RET
|
||
|
|
||
|
// func galMulAVX2(low, high, in, out []byte)
//
// For each full 32-byte block of in, every byte is split into its low and
// high nibble; the low nibble indexes the 16-byte table at low, the high
// nibble indexes the 16-byte table at high, and the XOR of both lookups is
// stored to the matching 32 bytes of out (overwriting them).
//
// Only len(in) is read: len(in)/32 blocks are processed and any trailing
// len(in)%32 bytes are left untouched. The length of out is not checked here.
// Uses AVX2 instructions; CPU support is not verified in this routine.
TEXT ·galMulAVX2(SB), 7, $0
	MOVQ low+0(FP), SI           // SI: &low[0]
	MOVQ high+24(FP), DX         // DX: &high[0]
	MOVQ $15, BX                 // BX: low-nibble mask value
	MOVQ BX, X5
	MOVQ in_len+56(FP), R9       // R9: len(in)

	// VPSHUFB shuffles within each 128-bit lane, so both lanes need a copy of
	// the table. Loaded the same way as in the 64-byte variants of this file.
	VBROADCASTI128 (SI), Y6      // Y6: low table in both lanes
	VBROADCASTI128 (DX), Y7      // Y7: high table in both lanes
	VPBROADCASTB   X5, Y8        // Y8: 0x0f in every byte

	SHRQ  $5, R9                 // R9: number of 32-byte blocks
	MOVQ  out+72(FP), DX         // DX: &out[0]
	MOVQ  in+48(FP), SI          // SI: &in[0]
	TESTQ R9, R9
	JZ    done_avx2

loopback_avx2:
	VMOVDQU (SI), Y0             // Y0: in block
	VPSRLQ  $4, Y0, Y1           // Y1: in >> 4 (stray bits masked below)
	VPAND   Y8, Y0, Y0           // Y0: low nibbles
	VPAND   Y8, Y1, Y1           // Y1: high nibbles
	VPSHUFB Y0, Y6, Y2           // Y2: low table lookup
	VPSHUFB Y1, Y7, Y3           // Y3: high table lookup
	VPXOR   Y3, Y2, Y4           // Y4: product
	VMOVDQU Y4, (DX)

	ADDQ $32, SI                 // in += 32
	ADDQ $32, DX                 // out += 32
	SUBQ $1, R9
	JNZ  loopback_avx2

done_avx2:
	VZEROUPPER                   // clear upper YMM halves before returning
	RET
|
||
|
|
||
|
// func galMulAVX2Xor_64(low, high, in, out []byte)
//
// 64-bytes-per-iteration form of the AVX2 multiply-and-xor: for each full
// 64-byte block of in, the low nibble of every byte indexes the 16-byte table
// at low, the high nibble indexes the 16-byte table at high, and the XOR of
// both lookups is XORed into the matching 64 bytes of out.
//
// Only len(in) is read: len(in)/64 blocks are processed and any trailing
// len(in)%64 bytes are left untouched. The length of out is not checked here.
// Uses AVX2 instructions; CPU support is not verified in this routine.
TEXT ·galMulAVX2Xor_64(SB), 7, $0
	// Tables: VPSHUFB works per 128-bit lane, so each lane gets a copy.
	MOVQ           low+0(FP), SI     // SI: &low[0]
	MOVQ           high+24(FP), DX   // DX: &high[0]
	VBROADCASTI128 (SI), Y6          // Y6: low table in both lanes
	VBROADCASTI128 (DX), Y7          // Y7: high table in both lanes

	// Nibble mask: 0x0f in every byte of Y8.
	MOVQ         $15, BX
	MOVQ         BX, X5
	VPBROADCASTB X5, Y8

	MOVQ  in+48(FP), SI              // SI: &in[0]
	MOVQ  out+72(FP), DX             // DX: &out[0]
	MOVQ  in_len+56(FP), R9          // R9: len(in)
	SHRQ  $6, R9                     // R9: number of 64-byte blocks
	TESTQ R9, R9
	JZ    xor_avx2_64_done

xor_avx2_64_loop:
	// First 32 bytes of the block.
	VMOVDQU (SI), Y0                 // Y0: in[0:32]
	VPSRLQ  $4, Y0, Y1               // Y1: in >> 4 (stray bits masked below)
	VPAND   Y8, Y0, Y0               // Y0: low nibbles
	VPAND   Y8, Y1, Y1               // Y1: high nibbles
	VPSHUFB Y0, Y6, Y2               // Y2: low table lookup
	VPSHUFB Y1, Y7, Y3               // Y3: high table lookup
	VPXOR   Y3, Y2, Y3               // Y3: product
	VPXOR   (DX), Y3, Y4             // Y4: product xor existing out[0:32]

	// Second 32 bytes of the block.
	VMOVDQU 32(SI), Y10              // Y10: in[32:64]
	VPSRLQ  $4, Y10, Y11             // Y11: in >> 4 (stray bits masked below)
	VPAND   Y8, Y10, Y10             // Y10: low nibbles
	VPAND   Y8, Y11, Y11             // Y11: high nibbles
	VPSHUFB Y10, Y6, Y12             // Y12: low table lookup
	VPSHUFB Y11, Y7, Y13             // Y13: high table lookup
	VPXOR   Y13, Y12, Y13            // Y13: product
	VPXOR   32(DX), Y13, Y14         // Y14: product xor existing out[32:64]

	// Both halves are stored only after all loads of this iteration.
	VMOVDQU Y4, (DX)
	VMOVDQU Y14, 32(DX)

	ADDQ $64, SI                     // in += 64
	ADDQ $64, DX                     // out += 64
	DECQ R9
	JNZ  xor_avx2_64_loop

xor_avx2_64_done:
	VZEROUPPER                       // clear upper YMM halves before returning
	RET
|
||
|
|
||
|
// func galMulAVX2_64(low, high, in, out []byte)
//
// 64-bytes-per-iteration form of the AVX2 multiply: for each full 64-byte
// block of in, the low nibble of every byte indexes the 16-byte table at low,
// the high nibble indexes the 16-byte table at high, and the XOR of both
// lookups is stored to the matching 64 bytes of out (overwriting them).
//
// Only len(in) is read: len(in)/64 blocks are processed and any trailing
// len(in)%64 bytes are left untouched. The length of out is not checked here.
// Uses AVX2 instructions; CPU support is not verified in this routine.
TEXT ·galMulAVX2_64(SB), 7, $0
	// Tables: VPSHUFB works per 128-bit lane, so each lane gets a copy.
	MOVQ           low+0(FP), SI     // SI: &low[0]
	MOVQ           high+24(FP), DX   // DX: &high[0]
	VBROADCASTI128 (SI), Y6          // Y6: low table in both lanes
	VBROADCASTI128 (DX), Y7          // Y7: high table in both lanes

	// Nibble mask: 0x0f in every byte of Y8.
	MOVQ         $15, BX
	MOVQ         BX, X5
	VPBROADCASTB X5, Y8

	MOVQ  in+48(FP), SI              // SI: &in[0]
	MOVQ  out+72(FP), DX             // DX: &out[0]
	MOVQ  in_len+56(FP), R9          // R9: len(in)
	SHRQ  $6, R9                     // R9: number of 64-byte blocks
	TESTQ R9, R9
	JZ    mul_avx2_64_done

mul_avx2_64_loop:
	// First 32 bytes of the block.
	VMOVDQU (SI), Y0                 // Y0: in[0:32]
	VPSRLQ  $4, Y0, Y1               // Y1: in >> 4 (stray bits masked below)
	VPAND   Y8, Y0, Y0               // Y0: low nibbles
	VPAND   Y8, Y1, Y1               // Y1: high nibbles
	VPSHUFB Y0, Y6, Y2               // Y2: low table lookup
	VPSHUFB Y1, Y7, Y3               // Y3: high table lookup
	VPXOR   Y3, Y2, Y4               // Y4: product for out[0:32]

	// Second 32 bytes of the block.
	VMOVDQU 32(SI), Y10              // Y10: in[32:64]
	VPSRLQ  $4, Y10, Y11             // Y11: in >> 4 (stray bits masked below)
	VPAND   Y8, Y10, Y10             // Y10: low nibbles
	VPAND   Y8, Y11, Y11             // Y11: high nibbles
	VPSHUFB Y10, Y6, Y12             // Y12: low table lookup
	VPSHUFB Y11, Y7, Y13             // Y13: high table lookup
	VPXOR   Y13, Y12, Y14            // Y14: product for out[32:64]

	// Both halves are stored only after all loads of this iteration.
	VMOVDQU Y4, (DX)
	VMOVDQU Y14, 32(DX)

	ADDQ $64, SI                     // in += 64
	ADDQ $64, DX                     // out += 64
	DECQ R9
	JNZ  mul_avx2_64_loop

mul_avx2_64_done:
	VZEROUPPER                       // clear upper YMM halves before returning
	RET
|