mirror of https://github.com/status-im/leopard.git
76 lines
2.2 KiB
Plaintext
76 lines
2.2 KiB
Plaintext
|
// In-place 4-point transform over data[0..3], built from four FWHT_2
// butterflies: pairs (0,1) and (2,3) first, then pairs (0,2) and (1,3).
// FWHT_2 is defined elsewhere; presumably it replaces its two arguments with
// their sum and difference -- confirm against its definition.
static inline void FWHT_4(GFSymbol* data)
{
#if 1
    // Scalar path (the one that is compiled).
    // Read all four symbols into locals before anything is written back.
    GFSymbol a = data[0];
    GFSymbol b = data[1];
    GFSymbol c = data[2];
    GFSymbol d = data[3];

    // First layer: adjacent pairs.
    FWHT_2(a, b);
    FWHT_2(c, d);

    // Second layer: elements two apart.
    FWHT_2(a, c);
    FWHT_2(b, d);

    // Write the results back in their original positions.
    data[0] = a;
    data[1] = b;
    data[2] = c;
    data[3] = d;
#else
    // Disabled SSE experiment. Unfinished: only the SSSE3 case is implemented,
    // the fallback below just hits a debug break.
    LHC_M128 * LHC_RESTRICT data64 = reinterpret_cast<LHC_M128 *>(data);

    // Keeps the low 16 bits of every 32-bit lane
    const LHC_M128 mask = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);

    // Load 8 bytes of data into low half of register
    // input = { t0 t1 t2 t3 0 0 0 0 }
    LHC_M128 input = _mm_loadl_epi64(data64);

    // FWHT_2(t0, t1); FWHT_2(t2, t3);

    // t0t2 = { t0 0 t2 0 0 0 0 0 }
    LHC_M128 t0t2 = _mm_and_si128(input, mask);

    // t1t3 = { t1 0 t3 0 0 0 0 0 }
    input = _mm_srli_epi32(input, 16);

    // sum = t0t2 + t1t3, input = t0t2 - t1t3 (per 32-bit lane)
    LHC_M128 sum = _mm_add_epi32(t0t2, input);
    input = _mm_sub_epi32(t0t2, input);

    if (CpuHasSSSE3)
    {
        // Reduce to 16-bit values
        // input = { t0 + t1, t2 + t3, 0, 0, t0 - t1, t2 - t3, 0, 0 }
        // Renaming the results for the next layer:
        // input = { t0, t2, 0, 0, t1, t3, 0, 0 }
        input = _mm_hadd_epi16(sum, input);

        // FWHT_2(t0, t2); FWHT_2(t1, t3);

        // Spread the values out
        // input = { t0, 0, t2, 0, t1, 0, t3, 0 }
        input = _mm_shufflelo_epi16(input, _MM_SHUFFLE(3, 1, 2, 0));
        input = _mm_shufflehi_epi16(input, _MM_SHUFFLE(3, 1, 2, 0));

        // Compute 32-bit sums with overflows
        // sum = { t0 + t2, t1 + t3, undef, undef } (32-bit overflows)
        sum = _mm_hadd_epi32(input, input);

        // Compute 32-bit difs with overflows
        // input = { t0 - t2, t1 - t3, undef, undef } (32-bit overflows)
        input = _mm_hsub_epi32(input, input);

        // Reduce to 16-bit values
        // input = { t0 + t2, t1 + t3, t0 + t2, t1 + t3, t0 - t2, t1 - t3, t0 - t2, t1 - t3 } (16-bit reduced)
        input = _mm_hadd_epi16(sum, input);

        // input = { t0 + t2, t1 + t3, t0 - t2, t1 - t3, undef, undef, undef, undef }
        input = _mm_shuffle_epi32(input, _MM_SHUFFLE(3, 1, 2, 0));

        // Store 8 bytes of data
        _mm_storel_epi64(data64, input);
    }
    else
    {
        LHC_DEBUG_BREAK; // FIXME

        // FWHT_2(t0, t2); FWHT_2(t1, t3);
    }
#endif
}
|