mirror of
https://github.com/logos-storage/outsourcing-Reed-Solomon.git
synced 2026-01-02 05:33:06 +00:00
speed up NTT by another almost 10% by using optimized short DFT routines
This commit is contained in:
parent
2adca16e10
commit
ae3c9827d3
@ -6,41 +6,57 @@
|
||||
#include <assert.h>
|
||||
|
||||
#include "goldilocks.h"
|
||||
#include "short_dft.h"
|
||||
#include "ntt.h"
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
void goldilocks_ntt_forward_noalloc(int m, int src_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
||||
|
||||
if (m==0) {
|
||||
tgt[0] = src[0];
|
||||
return;
|
||||
}
|
||||
switch(m) {
|
||||
|
||||
if (m==1) {
|
||||
// N = 2
|
||||
tgt[0] = goldilocks_add( src[0] , src[src_stride] ); // x + y
|
||||
tgt[1] = goldilocks_sub( src[0] , src[src_stride] ); // x - y
|
||||
return;
|
||||
}
|
||||
case 4:
|
||||
short_fwd_DFT_size_16( src_stride, 1, src, tgt );
|
||||
break;
|
||||
|
||||
else {
|
||||
int N = (1<< m );
|
||||
int halfN = (1<<(m-1));
|
||||
case 3:
|
||||
short_fwd_DFT_size_8( src_stride, 1, src, tgt );
|
||||
break;
|
||||
|
||||
goldilocks_ntt_forward_noalloc( m-1 , src_stride<<1 , gpows , src , buf + N , buf );
|
||||
goldilocks_ntt_forward_noalloc( m-1 , src_stride<<1 , gpows , src + src_stride , buf + N , buf + halfN );
|
||||
case 2:
|
||||
short_fwd_DFT_size_4( src_stride, 1, src, tgt );
|
||||
break;
|
||||
|
||||
for(int j=0; j<halfN; j++) {
|
||||
const uint64_t gpow = gpows[j*src_stride];
|
||||
tgt[j ] = goldilocks_mul( buf[j+halfN] , gpow ); // g*v[k]
|
||||
tgt[j+halfN] = goldilocks_neg( tgt[j ] ); // - g*v[k]
|
||||
tgt[j ] = goldilocks_add( tgt[j ] , buf[j] ); // u[k] + g*v[k]
|
||||
tgt[j+halfN] = goldilocks_add( tgt[j+halfN] , buf[j] ); // u[k] - g*v[k]
|
||||
case 1:
|
||||
// N = 2
|
||||
tgt[0] = goldilocks_add( src[0] , src[src_stride] ); // x + y
|
||||
tgt[1] = goldilocks_sub( src[0] , src[src_stride] ); // x - y
|
||||
return;
|
||||
|
||||
case 0:
|
||||
tgt[0] = src[0];
|
||||
return;
|
||||
|
||||
default: {
|
||||
|
||||
int N = (1<< m );
|
||||
int halfN = (1<<(m-1));
|
||||
|
||||
goldilocks_ntt_forward_noalloc( m-1 , src_stride<<1 , gpows , src , buf + N , buf );
|
||||
goldilocks_ntt_forward_noalloc( m-1 , src_stride<<1 , gpows , src + src_stride , buf + N , buf + halfN );
|
||||
|
||||
for(int j=0; j<halfN; j++) {
|
||||
const uint64_t gpow = gpows[j*src_stride];
|
||||
tgt[j ] = goldilocks_mul( buf[j+halfN] , gpow ); // g^k * v[k]
|
||||
tgt[j+halfN] = goldilocks_neg( tgt[j ] ); // - g^k * v[k]
|
||||
tgt[j ] = goldilocks_add( tgt[j ] , buf[j] ); // u[k] + g^k * v[k]
|
||||
tgt[j+halfN] = goldilocks_add( tgt[j+halfN] , buf[j] ); // u[k] - g^k * v[k]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// forward number-theoretical transform (evaluation of a polynomial)
|
||||
// `src` and `tgt` should be `N = 2^m` sized arrays of field elements
|
||||
// `gen` should be the generator of the multiplicative subgroup sized `N`
|
||||
@ -154,32 +170,46 @@ const uint64_t goldilocks_oneHalf = 0x7fffffff80000001ull;
|
||||
|
||||
void goldilocks_ntt_inverse_noalloc(int m, int tgt_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
||||
|
||||
if (m==0) {
|
||||
tgt[0] = src[0];
|
||||
return;
|
||||
}
|
||||
switch(m) {
|
||||
|
||||
if (m==1) {
|
||||
// N = 2
|
||||
case 0:
|
||||
tgt[0] = src[0];
|
||||
break;
|
||||
|
||||
tgt[0 ] = goldilocks_add( src[0] , src[1] ); // x + y
|
||||
tgt[tgt_stride] = goldilocks_sub( src[0] , src[1] ); // x - y
|
||||
return;
|
||||
}
|
||||
case 1:
|
||||
tgt[0 ] = goldilocks_add( src[0] , src[1] ); // x + y
|
||||
tgt[tgt_stride] = goldilocks_sub( src[0] , src[1] ); // x - y
|
||||
break;
|
||||
|
||||
else {
|
||||
int N = (1<< m );
|
||||
int halfN = (1<<(m-1));
|
||||
case 2:
|
||||
short_inv_DFT_size_4_unscaled( 1, tgt_stride, src, tgt );
|
||||
break;
|
||||
|
||||
for(int j=0; j<halfN; j++) {
|
||||
uint64_t gpow = gpows[j*tgt_stride];
|
||||
buf[j ] = goldilocks_add( src[j] , src[j+halfN] ); // x + y
|
||||
buf[j+halfN] = goldilocks_sub( src[j] , src[j+halfN] ); // x - y
|
||||
buf[j+halfN] = goldilocks_mul( buf[j+halfN] , gpow ); // (x - y) / g^k
|
||||
case 3:
|
||||
short_inv_DFT_size_8_unscaled( 1, tgt_stride, src, tgt );
|
||||
break;
|
||||
|
||||
case 4:
|
||||
short_inv_DFT_size_16_unscaled( 1, tgt_stride, src, tgt );
|
||||
break;
|
||||
|
||||
default: {
|
||||
|
||||
int N = (1<< m );
|
||||
int halfN = (1<<(m-1));
|
||||
|
||||
for(int j=0; j<halfN; j++) {
|
||||
uint64_t gpow = gpows[j*tgt_stride];
|
||||
buf[j ] = goldilocks_add( src[j] , src[j+halfN] ); // x + y
|
||||
buf[j+halfN] = goldilocks_sub( src[j] , src[j+halfN] ); // x - y
|
||||
buf[j+halfN] = goldilocks_mul( buf[j+halfN] , gpow ); // (x - y) / g^k
|
||||
}
|
||||
|
||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf , buf + N , tgt );
|
||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf + halfN , buf + N , tgt + tgt_stride );
|
||||
break;
|
||||
}
|
||||
|
||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf , buf + N , tgt );
|
||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf + halfN , buf + N , tgt + tgt_stride );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ const uint64_t IDFT4_INV_OMEGA = 0xfffeffff00000001 ;
|
||||
const uint64_t IDFT4_J = 0x0001000000000000 ;
|
||||
const uint64_t IDFT4_INV_4 = 0xbfffffff40000001 ;
|
||||
|
||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
int src_stride2 = src_stride + src_stride ;
|
||||
int src_stride3 = src_stride2 + src_stride ;
|
||||
@ -57,7 +57,7 @@ void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *sr
|
||||
|
||||
}
|
||||
|
||||
void short_fwd_DFT_size_4( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_fwd_DFT_size_4( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
||||
|
||||
int tgt_stride3 = 3*tgt_stride;
|
||||
@ -67,7 +67,7 @@ void short_fwd_DFT_size_4( int src_stride, int tgt_stride, uint64_t *src, uint64
|
||||
tgt[tgt_stride3] = tmp;
|
||||
}
|
||||
|
||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
||||
|
||||
@ -89,7 +89,7 @@ const uint64_t DFT8_J = 0x0001000000000000 ;
|
||||
const uint64_t DFT8_COS_U = 0xffffff7f00800081 ;
|
||||
const uint64_t DFT8_MINUS_J_SIN_U = 0xffffff7eff800081 ;
|
||||
|
||||
void short_fwd_DFT_size_8( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_fwd_DFT_size_8( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
// u = 2pi/8
|
||||
// omega = cos(u) + i*sin(u)
|
||||
//
|
||||
@ -164,7 +164,7 @@ const uint64_t IDFT8_INV_8 = 0xdfffffff20000001 ;
|
||||
|
||||
//--------------------------------------
|
||||
|
||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
// u = 2pi/8
|
||||
// omega = cos(u) + i*sin(u)
|
||||
//
|
||||
@ -231,7 +231,7 @@ void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *sr
|
||||
|
||||
//------------------
|
||||
|
||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
short_inv_DFT_size_8_unscaled( src_stride, tgt_stride, src, tgt );
|
||||
|
||||
@ -269,7 +269,7 @@ const uint64_t DFT16_COS_3U_MINUS_U = 0x0807fff7fff7f800 ;
|
||||
const uint64_t DFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
||||
const uint64_t DFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
||||
|
||||
void short_fwd_DFT_size_16( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_fwd_DFT_size_16( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
int src_stride2 = src_stride + src_stride ;
|
||||
int src_stride3 = src_stride2 + src_stride ;
|
||||
@ -408,7 +408,7 @@ const uint64_t IDFT16_COS_3U_MINUS_U = 0x0807fff7fff7f800 ;
|
||||
const uint64_t IDFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
||||
const uint64_t IDFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
||||
|
||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
int src_stride2 = src_stride + src_stride ;
|
||||
int src_stride3 = src_stride2 + src_stride ;
|
||||
@ -532,7 +532,7 @@ void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *s
|
||||
|
||||
//------------------
|
||||
|
||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||
|
||||
short_inv_DFT_size_16_unscaled( src_stride, tgt_stride, src, tgt );
|
||||
|
||||
|
||||
@ -5,23 +5,23 @@
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
void short_fwd_DFT_size_4 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_fwd_DFT_size_4 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
|
||||
void short_fwd_DFT_size_8 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_fwd_DFT_size_8 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
|
||||
void short_fwd_DFT_size_16 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
void short_fwd_DFT_size_16 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// void short_fwd_DFT_size_4_ext ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
// void short_inv_DFT_size_4_ext_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
// void short_inv_DFT_size_4_ext_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
||||
// void short_fwd_DFT_size_4_ext ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
// void short_inv_DFT_size_4_ext_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
// void short_inv_DFT_size_4_ext_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user