mirror of
https://github.com/logos-storage/outsourcing-Reed-Solomon.git
synced 2026-01-02 13:43:07 +00:00
speed up NTT by another almost 10% by using optimized short DFT routines
This commit is contained in:
parent
2adca16e10
commit
ae3c9827d3
@ -6,25 +6,39 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
#include "goldilocks.h"
|
#include "goldilocks.h"
|
||||||
|
#include "short_dft.h"
|
||||||
#include "ntt.h"
|
#include "ntt.h"
|
||||||
|
|
||||||
// -----------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
void goldilocks_ntt_forward_noalloc(int m, int src_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
void goldilocks_ntt_forward_noalloc(int m, int src_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
||||||
|
|
||||||
if (m==0) {
|
switch(m) {
|
||||||
tgt[0] = src[0];
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (m==1) {
|
case 4:
|
||||||
|
short_fwd_DFT_size_16( src_stride, 1, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
short_fwd_DFT_size_8( src_stride, 1, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
short_fwd_DFT_size_4( src_stride, 1, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 1:
|
||||||
// N = 2
|
// N = 2
|
||||||
tgt[0] = goldilocks_add( src[0] , src[src_stride] ); // x + y
|
tgt[0] = goldilocks_add( src[0] , src[src_stride] ); // x + y
|
||||||
tgt[1] = goldilocks_sub( src[0] , src[src_stride] ); // x - y
|
tgt[1] = goldilocks_sub( src[0] , src[src_stride] ); // x - y
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
else {
|
case 0:
|
||||||
|
tgt[0] = src[0];
|
||||||
|
return;
|
||||||
|
|
||||||
|
default: {
|
||||||
|
|
||||||
int N = (1<< m );
|
int N = (1<< m );
|
||||||
int halfN = (1<<(m-1));
|
int halfN = (1<<(m-1));
|
||||||
|
|
||||||
@ -33,14 +47,16 @@ void goldilocks_ntt_forward_noalloc(int m, int src_stride, const uint64_t *gpows
|
|||||||
|
|
||||||
for(int j=0; j<halfN; j++) {
|
for(int j=0; j<halfN; j++) {
|
||||||
const uint64_t gpow = gpows[j*src_stride];
|
const uint64_t gpow = gpows[j*src_stride];
|
||||||
tgt[j ] = goldilocks_mul( buf[j+halfN] , gpow ); // g*v[k]
|
tgt[j ] = goldilocks_mul( buf[j+halfN] , gpow ); // g^k * v[k]
|
||||||
tgt[j+halfN] = goldilocks_neg( tgt[j ] ); // - g*v[k]
|
tgt[j+halfN] = goldilocks_neg( tgt[j ] ); // - g^k * v[k]
|
||||||
tgt[j ] = goldilocks_add( tgt[j ] , buf[j] ); // u[k] + g*v[k]
|
tgt[j ] = goldilocks_add( tgt[j ] , buf[j] ); // u[k] + g^k * v[k]
|
||||||
tgt[j+halfN] = goldilocks_add( tgt[j+halfN] , buf[j] ); // u[k] - g*v[k]
|
tgt[j+halfN] = goldilocks_add( tgt[j+halfN] , buf[j] ); // u[k] - g^k * v[k]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// forward number-theoretical transform (evaluation of a polynomial)
|
// forward number-theoretical transform (evaluation of a polynomial)
|
||||||
// `src` and `tgt` should be `N = 2^m` sized arrays of field elements
|
// `src` and `tgt` should be `N = 2^m` sized arrays of field elements
|
||||||
// `gen` should be the generator of the multiplicative subgroup sized `N`
|
// `gen` should be the generator of the multiplicative subgroup sized `N`
|
||||||
@ -154,20 +170,31 @@ const uint64_t goldilocks_oneHalf = 0x7fffffff80000001ull;
|
|||||||
|
|
||||||
void goldilocks_ntt_inverse_noalloc(int m, int tgt_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
void goldilocks_ntt_inverse_noalloc(int m, int tgt_stride, const uint64_t *gpows, const uint64_t *src, uint64_t *buf, uint64_t *tgt) {
|
||||||
|
|
||||||
if (m==0) {
|
switch(m) {
|
||||||
|
|
||||||
|
case 0:
|
||||||
tgt[0] = src[0];
|
tgt[0] = src[0];
|
||||||
return;
|
break;
|
||||||
}
|
|
||||||
|
|
||||||
if (m==1) {
|
|
||||||
// N = 2
|
|
||||||
|
|
||||||
|
case 1:
|
||||||
tgt[0 ] = goldilocks_add( src[0] , src[1] ); // x + y
|
tgt[0 ] = goldilocks_add( src[0] , src[1] ); // x + y
|
||||||
tgt[tgt_stride] = goldilocks_sub( src[0] , src[1] ); // x - y
|
tgt[tgt_stride] = goldilocks_sub( src[0] , src[1] ); // x - y
|
||||||
return;
|
break;
|
||||||
}
|
|
||||||
|
case 2:
|
||||||
|
short_inv_DFT_size_4_unscaled( 1, tgt_stride, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
short_inv_DFT_size_8_unscaled( 1, tgt_stride, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
short_inv_DFT_size_16_unscaled( 1, tgt_stride, src, tgt );
|
||||||
|
break;
|
||||||
|
|
||||||
|
default: {
|
||||||
|
|
||||||
else {
|
|
||||||
int N = (1<< m );
|
int N = (1<< m );
|
||||||
int halfN = (1<<(m-1));
|
int halfN = (1<<(m-1));
|
||||||
|
|
||||||
@ -180,6 +207,9 @@ void goldilocks_ntt_inverse_noalloc(int m, int tgt_stride, const uint64_t *gpows
|
|||||||
|
|
||||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf , buf + N , tgt );
|
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf , buf + N , tgt );
|
||||||
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf + halfN , buf + N , tgt + tgt_stride );
|
goldilocks_ntt_inverse_noalloc( m-1 , tgt_stride<<1 , gpows , buf + halfN , buf + N , tgt + tgt_stride );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -30,7 +30,7 @@ const uint64_t IDFT4_INV_OMEGA = 0xfffeffff00000001 ;
|
|||||||
const uint64_t IDFT4_J = 0x0001000000000000 ;
|
const uint64_t IDFT4_J = 0x0001000000000000 ;
|
||||||
const uint64_t IDFT4_INV_4 = 0xbfffffff40000001 ;
|
const uint64_t IDFT4_INV_4 = 0xbfffffff40000001 ;
|
||||||
|
|
||||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
int src_stride2 = src_stride + src_stride ;
|
int src_stride2 = src_stride + src_stride ;
|
||||||
int src_stride3 = src_stride2 + src_stride ;
|
int src_stride3 = src_stride2 + src_stride ;
|
||||||
@ -57,7 +57,7 @@ void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *sr
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void short_fwd_DFT_size_4( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_fwd_DFT_size_4( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
||||||
|
|
||||||
int tgt_stride3 = 3*tgt_stride;
|
int tgt_stride3 = 3*tgt_stride;
|
||||||
@ -67,7 +67,7 @@ void short_fwd_DFT_size_4( int src_stride, int tgt_stride, uint64_t *src, uint64
|
|||||||
tgt[tgt_stride3] = tmp;
|
tgt[tgt_stride3] = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
short_inv_DFT_size_4_unscaled( src_stride, tgt_stride, src, tgt );
|
||||||
|
|
||||||
@ -89,7 +89,7 @@ const uint64_t DFT8_J = 0x0001000000000000 ;
|
|||||||
const uint64_t DFT8_COS_U = 0xffffff7f00800081 ;
|
const uint64_t DFT8_COS_U = 0xffffff7f00800081 ;
|
||||||
const uint64_t DFT8_MINUS_J_SIN_U = 0xffffff7eff800081 ;
|
const uint64_t DFT8_MINUS_J_SIN_U = 0xffffff7eff800081 ;
|
||||||
|
|
||||||
void short_fwd_DFT_size_8( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_fwd_DFT_size_8( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
// u = 2pi/8
|
// u = 2pi/8
|
||||||
// omega = cos(u) + i*sin(u)
|
// omega = cos(u) + i*sin(u)
|
||||||
//
|
//
|
||||||
@ -164,7 +164,7 @@ const uint64_t IDFT8_INV_8 = 0xdfffffff20000001 ;
|
|||||||
|
|
||||||
//--------------------------------------
|
//--------------------------------------
|
||||||
|
|
||||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
// u = 2pi/8
|
// u = 2pi/8
|
||||||
// omega = cos(u) + i*sin(u)
|
// omega = cos(u) + i*sin(u)
|
||||||
//
|
//
|
||||||
@ -231,7 +231,7 @@ void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *sr
|
|||||||
|
|
||||||
//------------------
|
//------------------
|
||||||
|
|
||||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
short_inv_DFT_size_8_unscaled( src_stride, tgt_stride, src, tgt );
|
short_inv_DFT_size_8_unscaled( src_stride, tgt_stride, src, tgt );
|
||||||
|
|
||||||
@ -269,7 +269,7 @@ const uint64_t DFT16_COS_3U_MINUS_U = 0x0807fff7fff7f800 ;
|
|||||||
const uint64_t DFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
const uint64_t DFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
||||||
const uint64_t DFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
const uint64_t DFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
||||||
|
|
||||||
void short_fwd_DFT_size_16( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_fwd_DFT_size_16( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
int src_stride2 = src_stride + src_stride ;
|
int src_stride2 = src_stride + src_stride ;
|
||||||
int src_stride3 = src_stride2 + src_stride ;
|
int src_stride3 = src_stride2 + src_stride ;
|
||||||
@ -408,7 +408,7 @@ const uint64_t IDFT16_COS_3U_MINUS_U = 0x0807fff7fff7f800 ;
|
|||||||
const uint64_t IDFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
const uint64_t IDFT16_J_SIN_3U_MINUS_U = 0x08080007fff80800 ;
|
||||||
const uint64_t IDFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
const uint64_t IDFT16_J_SIN_MINUS_3U_MINUS_U = 0x07f800080007f800 ;
|
||||||
|
|
||||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
int src_stride2 = src_stride + src_stride ;
|
int src_stride2 = src_stride + src_stride ;
|
||||||
int src_stride3 = src_stride2 + src_stride ;
|
int src_stride3 = src_stride2 + src_stride ;
|
||||||
@ -532,7 +532,7 @@ void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *s
|
|||||||
|
|
||||||
//------------------
|
//------------------
|
||||||
|
|
||||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt ) {
|
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt ) {
|
||||||
|
|
||||||
short_inv_DFT_size_16_unscaled( src_stride, tgt_stride, src, tgt );
|
short_inv_DFT_size_16_unscaled( src_stride, tgt_stride, src, tgt );
|
||||||
|
|
||||||
|
|||||||
@ -5,23 +5,23 @@
|
|||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
void short_fwd_DFT_size_4 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_fwd_DFT_size_4 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_4_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_4_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
|
|
||||||
void short_fwd_DFT_size_8 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_fwd_DFT_size_8 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_8_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_8_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
|
|
||||||
void short_fwd_DFT_size_16 ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_fwd_DFT_size_16 ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_16_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
void short_inv_DFT_size_16_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
// void short_fwd_DFT_size_4_ext ( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
// void short_fwd_DFT_size_4_ext ( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
// void short_inv_DFT_size_4_ext_unscaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
// void short_inv_DFT_size_4_ext_unscaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
// void short_inv_DFT_size_4_ext_rescaled( int src_stride, int tgt_stride, uint64_t *src, uint64_t *tgt );
|
// void short_inv_DFT_size_4_ext_rescaled( int src_stride, int tgt_stride, const uint64_t *src, uint64_t *tgt );
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user