mirror of
https://github.com/status-im/status-go.git
synced 2025-01-19 19:20:00 +00:00
584 lines
13 KiB
Go
584 lines
13 KiB
Go
//go:build !noasm && !appengine && !gccgo && !nopshufb
|
|
|
|
// Copyright 2015, Klaus Post, see LICENSE for details.
|
|
|
|
package reedsolomon
|
|
|
|
// pshufb is true whenever this file is compiled in; the build constraint at
// the top of the file excludes builds that set the nopshufb tag.
// NOTE(review): the readers of this flag live outside this file — confirm usage.
const pshufb = true
|
|
|
|
// galMulSSSE3 is implemented in assembly. Per the reference code kept in this
// file, for every input byte it stores low[b&0xf] ^ high[b>>4] into the
// matching position of out, working in blocks of 16 bytes. Callers in this
// file treat only the leading len(in) rounded down to a multiple of 16 bytes
// as processed and finish the remainder themselves.
//
//go:noescape
func galMulSSSE3(low, high []byte, in, out []byte)
|
|
|
|
// galMulSSSE3Xor is implemented in assembly. Per the reference code kept in
// this file, for every input byte it XORs low[b&0xf] ^ high[b>>4] into the
// matching position of out, working in blocks of 16 bytes. Callers in this
// file treat only the leading len(in) rounded down to a multiple of 16 bytes
// as processed and finish the remainder themselves.
//
//go:noescape
func galMulSSSE3Xor(low, high []byte, in, out []byte)
|
|
|
|
// galMulAVX2Xor is implemented in assembly. Callers in this file pass the
// low/high nibble tables of the multiplier and afterwards skip the leading
// len(in) rounded down to a multiple of 32 bytes.
// NOTE(review): presumably the XOR-accumulating counterpart of galMulAVX2
// (out ^= product) — confirm against the assembly.
//
//go:noescape
func galMulAVX2Xor(low, high []byte, in, out []byte)
|
|
|
|
// galMulAVX2 is implemented in assembly. Callers in this file pass the
// low/high nibble tables of the multiplier and afterwards skip the leading
// len(in) rounded down to a multiple of 32 bytes.
// NOTE(review): presumably the 32-byte-block analogue of galMulSSSE3
// (out = product) — confirm against the assembly.
//
//go:noescape
func galMulAVX2(low, high []byte, in, out []byte)
|
|
|
|
// galMulAVX2Xor_64 is implemented in assembly. Callers in this file pass the
// low/high nibble tables of the multiplier and afterwards skip the leading
// len(in) rounded down to a multiple of 64 bytes.
// NOTE(review): presumably XOR-accumulates the product into out, 64 bytes
// per loop — confirm against the assembly.
//
//go:noescape
func galMulAVX2Xor_64(low, high []byte, in, out []byte)
|
|
|
|
// galMulAVX2_64 is implemented in assembly. Callers in this file pass the
// low/high nibble tables of the multiplier and afterwards skip the leading
// len(in) rounded down to a multiple of 64 bytes.
// NOTE(review): presumably overwrites out with the product, 64 bytes per
// loop — confirm against the assembly.
//
//go:noescape
func galMulAVX2_64(low, high []byte, in, out []byte)
|
|
|
|
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSSE3(low, high, in, out []byte) {
	for n, input := range in {
		l := input & 0xf
		h := input >> 4
		out[n] = low[l] ^ high[h]
	}
}

func galMulSSSE3Xor(low, high, in, out []byte) {
	for n, input := range in {
		l := input & 0xf
		h := input >> 4
		out[n] ^= low[l] ^ high[h]
	}
}
*/
|
|
|
|
// bigSwitchover is the minimum input length, in bytes, at which the routines
// that process 64 bytes per loop iteration are used.
|
|
const bigSwitchover = 128 // bytes; galMulSlice, galMulSliceXor and sliceXor compare len(in) against it
|
|
|
|
func galMulSlice(c byte, in, out []byte, o *options) {
|
|
if c == 1 {
|
|
copy(out, in)
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
if len(in) >= bigSwitchover {
|
|
galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 6) << 6
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
if len(in) > 32 {
|
|
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 5) << 5
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
} else if o.useSSSE3 {
|
|
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 4) << 4
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
out = out[:len(in)]
|
|
mt := mulTable[c][:256]
|
|
for i := range in {
|
|
out[i] = mt[in[i]]
|
|
}
|
|
}
|
|
|
|
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
|
if c == 1 {
|
|
sliceXor(in, out, o)
|
|
return
|
|
}
|
|
|
|
if o.useAVX2 {
|
|
if len(in) >= bigSwitchover {
|
|
galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 6) << 6
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
if len(in) >= 32 {
|
|
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 5) << 5
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
} else if o.useSSSE3 {
|
|
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
|
done := (len(in) >> 4) << 4
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
if len(in) == 0 {
|
|
return
|
|
}
|
|
out = out[:len(in)]
|
|
mt := mulTable[c][:256]
|
|
for i := range in {
|
|
out[i] ^= mt[in[i]]
|
|
}
|
|
}
|
|
|
|
// simple slice xor
|
|
func sliceXor(in, out []byte, o *options) {
|
|
if o.useSSE2 {
|
|
if len(in) >= bigSwitchover {
|
|
if o.useAVX2 {
|
|
avx2XorSlice_64(in, out)
|
|
done := (len(in) >> 6) << 6
|
|
in = in[done:]
|
|
out = out[done:]
|
|
} else {
|
|
sSE2XorSlice_64(in, out)
|
|
done := (len(in) >> 6) << 6
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
}
|
|
if len(in) >= 16 {
|
|
sSE2XorSlice(in, out)
|
|
done := (len(in) >> 4) << 4
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
} else {
|
|
sliceXorGo(in, out, o)
|
|
return
|
|
}
|
|
out = out[:len(in)]
|
|
for i := range in {
|
|
out[i] ^= in[i]
|
|
}
|
|
}
|
|
|
|
// 4-way butterfly
|
|
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
|
|
if len(work[0]) == 0 {
|
|
return
|
|
}
|
|
|
|
t01 := &multiply256LUT[log_m01]
|
|
t23 := &multiply256LUT[log_m23]
|
|
t02 := &multiply256LUT[log_m02]
|
|
if o.useAVX512 {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx512_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx512_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx512_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx512_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx512_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx512_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx512_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx512_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
} else if o.useAVX2 {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx2_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx2_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx2_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx2_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx2_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx2_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus {
|
|
ifftDIT4_avx2_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT4_avx2_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
|
|
}
|
|
|
|
// 4-way butterfly
|
|
func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
|
|
if len(work[0]) == 0 {
|
|
return
|
|
}
|
|
|
|
if false && o.useAvx512GFNI {
|
|
// Note that these currently require that length is multiple of 64.
|
|
t01 := gf2p811dMulMatrices[log_m01]
|
|
t23 := gf2p811dMulMatrices[log_m23]
|
|
t02 := gf2p811dMulMatrices[log_m02]
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_gfni_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_gfni_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_gfni_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_gfni_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_gfni_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_gfni_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_gfni_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_gfni_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
// Note that these currently require that length is multiple of 64.
|
|
t01 := &multiply256LUT8[log_m01]
|
|
t23 := &multiply256LUT8[log_m23]
|
|
t02 := &multiply256LUT8[log_m02]
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_avx2_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_avx2_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_avx2_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_avx2_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_avx2_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_avx2_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m02 == modulus8 {
|
|
ifftDIT48_avx2_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
ifftDIT48_avx2_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
ifftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
|
|
}
|
|
|
|
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
|
|
if len(work[0]) == 0 {
|
|
return
|
|
}
|
|
|
|
t01 := &multiply256LUT[log_m01]
|
|
t23 := &multiply256LUT[log_m23]
|
|
t02 := &multiply256LUT[log_m02]
|
|
if o.useAVX512 {
|
|
if log_m02 == modulus {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx512_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx512_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx512_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx512_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx512_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx512_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx512_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx512_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
} else if o.useAVX2 {
|
|
if log_m02 == modulus {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx2_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx2_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx2_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx2_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m01 == modulus {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx2_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx2_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus {
|
|
fftDIT4_avx2_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT4_avx2_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
|
|
}
|
|
|
|
// 4-way butterfly
|
|
func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *options) {
|
|
if len(work[0]) == 0 {
|
|
return
|
|
}
|
|
|
|
if false && o.useAvx512GFNI {
|
|
t01 := gf2p811dMulMatrices[log_m01]
|
|
t23 := gf2p811dMulMatrices[log_m23]
|
|
t02 := gf2p811dMulMatrices[log_m02]
|
|
// Note that these currently require that length is multiple of 64.
|
|
if log_m02 == modulus8 {
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_gfni_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_gfni_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_gfni_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_gfni_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_gfni_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_gfni_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_gfni_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_gfni_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
t01 := &multiply256LUT8[log_m01]
|
|
t23 := &multiply256LUT8[log_m23]
|
|
t02 := &multiply256LUT8[log_m02]
|
|
// Note that these currently require that length is multiple of 64.
|
|
if log_m02 == modulus8 {
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_avx2_7(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_avx2_3(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_avx2_5(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_avx2_1(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
} else {
|
|
if log_m01 == modulus8 {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_avx2_6(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_avx2_2(work, dist*24, t01, t23, t02)
|
|
}
|
|
} else {
|
|
if log_m23 == modulus8 {
|
|
fftDIT48_avx2_4(work, dist*24, t01, t23, t02)
|
|
} else {
|
|
fftDIT48_avx2_0(work, dist*24, t01, t23, t02)
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
fftDIT4Ref8(work, dist, log_m01, log_m23, log_m02, o)
|
|
}
|
|
|
|
// 2-way butterfly forward
|
|
func fftDIT2(x, y []byte, log_m ffe, o *options) {
|
|
if len(x) == 0 {
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
tmp := &multiply256LUT[log_m]
|
|
fftDIT2_avx2(x, y, tmp)
|
|
} else if o.useSSSE3 {
|
|
tmp := &multiply256LUT[log_m]
|
|
fftDIT2_ssse3(x, y, tmp)
|
|
} else {
|
|
// Reference version:
|
|
refMulAdd(x, y, log_m)
|
|
sliceXor(x, y, o)
|
|
}
|
|
}
|
|
|
|
// 2-way butterfly forward
|
|
func fftDIT28(x, y []byte, log_m ffe8, o *options) {
|
|
if len(x) == 0 {
|
|
return
|
|
}
|
|
|
|
if o.useAVX2 {
|
|
fftDIT28_avx2(x, y, &multiply256LUT8[log_m])
|
|
if len(x)&63 == 0 {
|
|
return
|
|
}
|
|
done := (len(y) >> 6) << 6
|
|
y = y[done:]
|
|
x = x[done:]
|
|
}
|
|
mulAdd8(x, y, log_m, o)
|
|
sliceXor(x, y, o)
|
|
}
|
|
|
|
// 2-way butterfly inverse
|
|
func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
|
|
if len(x) == 0 {
|
|
return
|
|
}
|
|
|
|
if o.useAVX2 {
|
|
ifftDIT28_avx2(x, y, &multiply256LUT8[log_m])
|
|
if len(x)&63 == 0 {
|
|
return
|
|
}
|
|
done := (len(y) >> 6) << 6
|
|
y = y[done:]
|
|
x = x[done:]
|
|
}
|
|
sliceXor(x, y, o)
|
|
mulAdd8(x, y, log_m, o)
|
|
}
|
|
|
|
func mulAdd8(x, y []byte, log_m ffe8, o *options) {
|
|
if o.useAVX2 {
|
|
t := &multiply256LUT8[log_m]
|
|
galMulAVX2Xor_64(t[:16], t[16:32], y, x)
|
|
done := (len(y) >> 6) << 6
|
|
y = y[done:]
|
|
x = x[done:]
|
|
} else if o.useSSSE3 {
|
|
t := &multiply256LUT8[log_m]
|
|
galMulSSSE3Xor(t[:16], t[16:32], y, x)
|
|
done := (len(y) >> 4) << 4
|
|
y = y[done:]
|
|
x = x[done:]
|
|
}
|
|
refMulAdd8(x, y, log_m)
|
|
}
|
|
|
|
// 2-way butterfly
|
|
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
|
|
if len(x) == 0 {
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
tmp := &multiply256LUT[log_m]
|
|
ifftDIT2_avx2(x, y, tmp)
|
|
} else if o.useSSSE3 {
|
|
tmp := &multiply256LUT[log_m]
|
|
ifftDIT2_ssse3(x, y, tmp)
|
|
} else {
|
|
// Reference version:
|
|
sliceXor(x, y, o)
|
|
refMulAdd(x, y, log_m)
|
|
}
|
|
}
|
|
|
|
func mulgf16(x, y []byte, log_m ffe, o *options) {
|
|
if len(x) == 0 {
|
|
return
|
|
}
|
|
if o.useAVX2 {
|
|
tmp := &multiply256LUT[log_m]
|
|
mulgf16_avx2(x, y, tmp)
|
|
} else if o.useSSSE3 {
|
|
tmp := &multiply256LUT[log_m]
|
|
mulgf16_ssse3(x, y, tmp)
|
|
} else {
|
|
refMul(x, y, log_m)
|
|
}
|
|
}
|
|
|
|
func mulgf8(out, in []byte, log_m ffe8, o *options) {
|
|
if o.useAVX2 {
|
|
t := &multiply256LUT8[log_m]
|
|
galMulAVX2_64(t[:16], t[16:32], in, out)
|
|
done := (len(in) >> 6) << 6
|
|
in = in[done:]
|
|
out = out[done:]
|
|
} else if o.useSSSE3 {
|
|
t := &multiply256LUT8[log_m]
|
|
galMulSSSE3(t[:16], t[16:32], in, out)
|
|
done := (len(in) >> 4) << 4
|
|
in = in[done:]
|
|
out = out[done:]
|
|
}
|
|
out = out[:len(in)]
|
|
mt := mul8LUTs[log_m].Value[:]
|
|
for i := range in {
|
|
out[i] = byte(mt[in[i]])
|
|
}
|
|
}
|