#include <math.h>
#include <assert.h>

#include "fxtaux.h"
#include "fxt.h"

// RX: presumably the radix of the transform (4); it is not referenced in the
// code visible in this file -- TODO confirm against other users.
#define RX 4
// LX: per-stage shift amount: (m>>LX) gives the quarter block length m4, and
// the stage exponent ldm advances by LX on every pass.
#define LX 2

// When defined, an odd ldn is handled by an initial pass of fft8cc() over
// consecutive blocks of 8 elements instead of a separate two-point pass.
#define USE_INITIAL_RAD8 // recommended


void
dit4_fft(double *fr, double *fi, ulong ldn, int is)
//
// optimized version of radix 4 fft (decimation in time)
//
// fr, fi: data arrays; n = 2^ldn elements of each are handed to
//         scramble() and then to dit4_fft_core()
//         (presumably real and imaginary parts -- confirm against fxt.h)
// ldn:    base-2 logarithm of the transform length
// is:     passed unchanged to dit4_fft_core(), which branches on its sign
//
// The arrays are first reordered by scramble() and then processed
// by dit4_fft_core().
//
{
    // Shift an ulong, not the int constant 1: (1<<ldn) is undefined once
    // the result no longer fits in int (ldn >= 31 where int is 32 bits).
    const ulong n = ((ulong)1 << ldn);

    scramble(fr, fi, n);

    dit4_fft_core(fr, fi, ldn, is);
}
// ==================== end dit4_fft =================



void
dit4_fft_core(double *fr, double *fi, ulong ldn, int is)
//
// Radix-4 butterfly passes over n = 2^ldn points, working in place
// on fr[] and fi[].  No reordering is done here; dit4_fft() calls
// scramble() on the data before calling this routine.
//
// fr, fi: data arrays of n elements each
//         (presumably real and imaginary parts -- confirm against fxt.h)
// ldn:    base-2 logarithm of the transform length
// is:     only the sign is used: it selects the sign of the twiddle
//         angle and which of the two butterfly variants is applied;
//         it is also forwarded to fft8cc()
//
// Stage layout (ldm is the log2 of the block length m of a pass):
//   n <= 2   : nothing (n==1) or one two-point sum/difference (n==2)
//   ldn odd  : with USE_INITIAL_RAD8 blocks of 8 go through fft8cc()
//              and the main loop starts at m=32; otherwise a two-point
//              pass is done and the main loop starts at m=8
//   ldn even : a multiplication-free pass with m=4, then the main
//              loop starts at m=16
//
// sumdiff2(), sumdiff4(), csqr4(), cmult6(), sincos() and fft8cc()
// come from the project headers; their exact semantics are not
// visible here.
//
{
    ulong ldm, m, m4;
    ulong j, r;
    ulong i0, i1, i2, i3;

    // Shift an ulong, not the int constant 1: (1<<ldn) is undefined once
    // the result no longer fits in int (ldn >= 31 where int is 32 bits).
    const ulong n = ((ulong)1 << ldn);

    if (n <= 2)  // data length is 1 or 2
    {
        if (n == 2)
        {
            sumdiff2(fr[0], fr[1]);
            sumdiff2(fi[0], fi[1]);
        }

        return;
    }


    ldm = (ldn & 1);
    if (ldm != 0)  // n is not a power of 4, need an extra initial step
    {
#if defined USE_INITIAL_RAD8

        // n >= 8 here, so the data splits into whole blocks of 8
        for (i0 = 0; i0 < n; i0 += 8)
        {
            fft8cc(fr + i0, fi + i0, is);
        }

        // blocks of 8 are done: together with the increment below
        // the main loop starts at ldm==5 (m==32)
        ldm += LX;

#else

        // two-point pass on neighbouring elements; uses the in-place
        // two-argument sumdiff2() like the n==2 case above
        // (sumdiff4() takes four arguments and cannot be used here)
        for (i0 = 0, i1 = 1; i0 < n; i0 += 2, i1 += 2)
        {
            sumdiff2(fr[i0], fr[i1]);
            sumdiff2(fi[i0], fi[i1]);
        }

#endif  // defined USE_INITIAL_RAD8
    }

    ldm += LX;

    if (ldm == LX)  // ldn is even: first pass with m==4 needs no twiddles
    {
        m = ((ulong)1 << ldm);
        m4 = (m >> LX);

        for (i0 = 0; i0 < n; i0 += m)
        {
            double xr, yr, ur, vr, xi, yi, ui, vi;

            i1 = i0 + m4;
            i2 = i1 + m4;
            i3 = i2 + m4;

            sumdiff4(fr[i0], fr[i1], xr, ur);
            sumdiff4(fi[i0], fi[i1], xi, ui);

            // the sign of is decides the operand order, i.e. the sign
            // of the cross terms vr and vi
            if (is < 0)
            {
                sumdiff4(fr[i3], fr[i2], yr, vi);
                sumdiff4(fi[i2], fi[i3], yi, vr);
            }
            else
            {
                sumdiff4(fr[i2], fr[i3], yr, vi);
                sumdiff4(fi[i3], fi[i2], yi, vr);
            }

            sumdiff4(ur, vr, fr[i1], fr[i3]);
            sumdiff4(ui, vi, fi[i1], fi[i3]);
            sumdiff4(xr, yr, fr[i0], fr[i2]);
            sumdiff4(xi, yi, fi[i0], fi[i2]);
        }

        ldm += LX;
    }


    //------------- main loop :

    for ( ; ldm <= ldn; ldm += LX)
    {
        // same int-overflow concern as for n: shift an ulong
        m = ((ulong)1 << ldm);
        m4 = (m >> LX);

        // angle step of this pass: +-2*pi/m, sign taken from is
        const double ph0 = (is > 0 ? 2.0*M_PI : -2.0*M_PI) / m;

        for (j = 0; j < m4; j++)
        {
            // (c,s) is obtained for the angle j*ph0; (c2,s2) and (c3,s3)
            // are derived from it by csqr4() and cmult6()
            double c, s, c2, s2, c3, s3;
            sincos(&c, &s, j*ph0);
            csqr4(c, s, c2, s2);
            cmult6(c, s, c2, s2, c3, s3);

            // visit offset j in every block of length m
            for (r = 0, i0 = j + r; r < n; r += m, i0 += m)
            {
                double a1r, a1i;
                double a3r, a3i;
                double xr, yr, ur, vr, xi, yi, ui, vi;

                i1 = i0 + m4;
                i2 = i1 + m4;
                i3 = i2 + m4;

                // element i1 is combined with (c2,s2)
                cmult6(fr[i1], fi[i1], c2, s2, a1r, a1i);

                sumdiff4(fr[i0], a1r, xr, ur);
                sumdiff4(fi[i0], a1i, xi, ui);

                // a1r,a1i are reused: element i2 is combined with (c,s),
                // element i3 with (c3,s3)
                cmult6(fr[i2], fi[i2], c, s, a1r, a1i);
                cmult6(fr[i3], fi[i3], c3, s3, a3r, a3i);

                if (is < 0)
                {
                    sumdiff4(a3r, a1r, yr, vi);
                    sumdiff4(a1i, a3i, yi, vr);
                }
                else
                {
                    sumdiff4(a1r, a3r, yr, vi);
                    sumdiff4(a3i, a1i, yi, vr);
                }

                sumdiff4(ur, vr, fr[i1], fr[i3]);
                sumdiff4(ui, vi, fi[i1], fi[i3]);
                sumdiff4(xr, yr, fr[i0], fr[i2]);
                sumdiff4(xi, yi, fi[i0], fi[i2]);
            }
        }
    }
}
// ==================== end dit4_fft_core =================
