
#include "fxtaux.h"
#include "aux4step.h"


//
// Compared to the 'usual' FFTs, the four step algorithm has much
// better memory locality. On systems with hierarchical memory
// ('cache') this may boost the performance
// (e.g. on systems with a very high CPU clock).
//


//
// In-place auto convolution of the complex sequence fr[] + i*fi[]
// of length n = 2^ldn, using the four step FFT scheme:
// column pass, mat_exp() pass, row pass, element-wise csqr2(),
// then the same three passes in reverse order with opposite sign,
// and finally a scaling of both arrays by 1/n.
//
//   fr, fi  real and imaginary parts (n elements each), overwritten
//   ldn     base-2 logarithm of the total length n
//   zp      forwarded to the first mat_col_fft() call only; all other
//           passes get 0 (presumably a zero-padding hint -- confirm
//           against mat_col_fft())
//
void
four_step_complex_auto_convolution(double *fr, double *fi, ulong ldn, int zp)
{
    // Split the exponent so that n1*n2 == 2^ldn; n1 and n2 are the
    // two dimensions handed to the mat_*() helpers.
    const ulong ldn1 = (ldn >> 1);
    const ulong ldn2 = ldn - ldn1;

    // Shift a ulong one, not an int one: (1<<k) is evaluated in int
    // and is undefined as soon as k reaches the width of int.
    const ulong n1 = ((ulong)1 << ldn1);
    const ulong n2 = ((ulong)1 << ldn2);  // n2>=n1

    const ulong n = n1 * n2;
    const int is = 1;  // sign for the forward passes; negated on the way back

    // forward transform
    mat_col_fft(fr, fi, n1, n2, is, zp);
    mat_exp(fr, fi, n1, n2, is);
    mat_row_fft(fr, fi, n1, n2, is, 0);

    // element-wise step in the transformed domain
    // (csqr2() presumably squares fr[i] + i*fi[i] in place)
    for (ulong i = 0; i < n; ++i)  csqr2(fr[i], fi[i]);

    // backward transform: same passes in reverse order, opposite sign
    mat_row_fft(fr, fi, n1, n2, -is, 0);
    mat_exp(fr, fi, n1, n2, -is);
    mat_col_fft(fr, fi, n1, n2, -is, 0);

    // normalization
    d_multiply(fr, n, 1.0/n);
    d_multiply(fi, n, 1.0/n);
}
// =========== end FOUR_STEP_COMPLEX_AUTO_CONVOLUTION ============



//
// Convolution of the two real sequences f[] and g[] (n = 2^ldn
// elements each) with a single complex four step transform:
// f is used as the real part and g as the imaginary part of one
// complex sequence, which is transformed, passed element-wise
// through csqr2(), and transformed back.
//
// Assuming csqr2() squares a complex number in place,
//   (f + i*g)^2 = (f^2 - g^2) + i*(2*f*g),
// so the imaginary part carries twice the wanted product; this is
// why g is scaled by 1/(2n) rather than 1/n.
//
//   f    first input; overwritten with unnormalized auxiliary data
//   g    second input; overwritten with the scaled result
//   ldn  base-2 logarithm of the length n
//   zp   forwarded to the first mat_col_fft() call only; all other
//        passes get 0 (presumably a zero-padding hint -- confirm
//        against mat_col_fft())
//
void
four_step_convolution(double *f, double *g, ulong ldn, int zp)
{
    // No 'fr'/'fi' alias macros here: a #define without a matching
    // #undef would leak into all code following this function.

    // Split the exponent so that n1*n2 == 2^ldn; n1 and n2 are the
    // two dimensions handed to the mat_*() helpers.
    const ulong ldn1 = (ldn >> 1);
    const ulong ldn2 = ldn - ldn1;

    // Shift a ulong one, not an int one: (1<<k) is evaluated in int
    // and is undefined as soon as k reaches the width of int.
    const ulong n1 = ((ulong)1 << ldn1);
    const ulong n2 = ((ulong)1 << ldn2);  // n2>=n1

    const ulong n = n1 * n2;
    const int is = 1;  // sign for the forward passes; negated on the way back

    // forward transform of f + i*g
    mat_col_fft(f, g, n1, n2, is, zp);
    mat_exp(f, g, n1, n2, is);
    mat_row_fft(f, g, n1, n2, is, 0);

    // element-wise step in the transformed domain
    for (ulong i = 0; i < n; ++i)  csqr2(f[i], g[i]);

    // backward transform: same passes in reverse order, opposite sign
    mat_row_fft(f, g, n1, n2, -is, 0);
    mat_exp(f, g, n1, n2, -is);
    mat_col_fft(f, g, n1, n2, -is, 0);

    // Only g is normalized; f is not rescaled here.
    // The extra factor 2 in the divisor is explained in the header comment.
    d_multiply(g, n, 1.0/(2*n));
}
// =============== end FOUR_STEP_CONVOLUTION ====================

