#include <math.h>
#include <assert.h>

#include "fxtaux.h"
#include "fxt.h"

// Presumably the radix of the transform; not referenced in the code shown here.
#define RX 4
// Base-2 logarithm of the radix: every main-loop pass lowers ldm by LX
// and the quarter block length is computed as (m>>LX).
#define LX 2

// When defined, the main loop of dif4_fft_core() stops one radix-4 pass
// earlier and an odd ldn is finished by calling fft8ss() on blocks of 8
// elements instead of using a final radix-2 pass.
#define USE_FINAL_RAD8  // recommended


// Radix-4 decimation-in-frequency (DIF) FFT of the complex sequence
// whose real parts are in fr[] and whose imaginary parts are in fi[].
//
//   fr, fi : arrays of n = 2**ldn elements each, modified in place
//   ldn    : base-2 logarithm of the transform length
//   is     : sign selector, handed unchanged to dif4_fft_core()
//
// The butterfly passes are done by dif4_fft_core(); afterwards
// scramble() is applied to both arrays (presumably the index
// permutation that restores natural order -- see its definition).
void
dif4_fft(double *fr, double *fi, ulong ldn, int is)
{
    dif4_fft_core(fr, fi, ldn, is);

    // Shift a ulong one: shifting the int constant 1 overflows
    // (undefined behaviour) as soon as ldn >= 31.
    const ulong n = ((ulong)1 << ldn);

    scramble(fr, fi, n);
}
// ===================== end DIF4_FFT =======================



// Butterfly passes of the radix-4 decimation-in-frequency FFT.
//
//   fr, fi : real and imaginary parts, n = 2**ldn elements each,
//            modified in place
//   ldn    : base-2 logarithm of the transform length
//   is     : sign of the exponent; is>0 uses phase +2*pi/m,
//            otherwise -2*pi/m
//
// No reordering of the result is done here; dif4_fft() calls
// scramble() afterwards.
//
// Helper conventions assumed below (the helpers are defined outside
// this file -- confirm against their definitions):
//   sumdiff2(a,b)          : in place, a <- a+b, b <- a-b
//   sumdiff4(a,b,s,d)      : s <- a+b, d <- a-b
//   cmult6(a,b,c,d,u,v)    : (u + i*v) <- (a + i*b)*(c + i*d)
//   csqr4(a,b,u,v)         : (u + i*v) <- (a + i*b)^2
//   sincos(&c,&s,phi)      : c <- cos(phi), s <- sin(phi)
void
dif4_fft_core(double *fr, double *fi, ulong ldn, int is)
{
    ulong ldm, m, m4;
    ulong j, r;
    ulong i0, i1, i2, i3;

    // Shift a ulong one: shifting the int constant 1 overflows
    // (undefined behaviour) as soon as ldn >= 31.
    const ulong n = ((ulong)1 << ldn);

    if (n <= 2)
    {
        if (n == 2)  // data length is 2: a single butterfly
        {
            sumdiff2(fr[0], fr[1]);
            sumdiff2(fi[0], fi[1]);
        }

        return;  // length 1 needs no work
    }

    // Main loop: radix-4 passes with twiddle factors.
    // With USE_FINAL_RAD8 the loop runs while ldm>=4, so it exits with
    // ldm==3 (odd ldn) or ldm==2 (even ldn).
    // Without it the bound is 2 for odd ldm and 4 for even ldm, so it
    // exits with ldm==1 (odd ldn) or ldm==2 (even ldn).
#if defined USE_FINAL_RAD8
    for (ldm = ldn; ldm >= (LX << 1); ldm -= LX)
#else
    for (ldm = ldn; ldm >= (LX << (!(ldm & 1))); ldm -= LX)
#endif  // defined USE_FINAL_RAD8
    {
        m = ((ulong)1 << ldm);   // block length of this pass
        m4 = (m >> LX);          // quarter block length
        const double ph0 = (is > 0 ? 2.0 * M_PI : -2.0 * M_PI) / m;

        for (j = 0; j < m4; j++)
        {
            // Twiddle factor for index j together with its square
            // and its cube.
            double c, s, c2, s2, c3, s3;
            sincos(&c, &s, j * ph0);
            csqr4(c, s, c2, s2);
            cmult6(c, s, c2, s2, c3, s3);

            // Same butterfly position j in every block of length m
            for (r = 0, i0 = j + r; r < n; r += m, i0 += m)
            {
                double xr, yr, ur, vr, xi, yi, ui, vi;

                i1 = i0 + m4;
                i2 = i1 + m4;
                i3 = i2 + m4;

                sumdiff4(fr[i0], fr[i2], xr, ur);
                sumdiff4(fi[i0], fi[i2], xi, ui);

                // The sign of the transform decides the operand order,
                // i.e. the sign of the differences vr and vi.
                if (is < 0)
                {
                    sumdiff4(fi[i1], fi[i3], yi, vr);
                    sumdiff4(fr[i3], fr[i1], yr, vi);
                }
                else
                {
                    sumdiff4(fi[i3], fi[i1], yi, vr);
                    sumdiff4(fr[i1], fr[i3], yr, vi);
                }

                fr[i0] = (xr + yr);
                yr -= xr;

                fi[i0] = (xi + yi);
                yi -= xi;

                // (-yr, -yi) is (xr - yr_old, xi - yi_old)
                cmult6(-yr, -yi, c2, s2, fr[i1], fi[i1]);

                sumdiff4(ur, vr, xr, yr);
                sumdiff4(ui, vi, xi, yi);

                cmult6(yr, yi, c3, s3, fr[i3], fi[i3]);
                cmult6(xr, xi, c, s, fr[i2], fi[i2]);
            }
        }
    }

    //------------- end main loop


    if ((ldn & 1) != 0)  // n is not a power of 4: one extra step needed
    {
#if defined USE_FINAL_RAD8

        // Finish with fft8ss() on each block of 8 elements
        for (i0 = 0; i0 < n; i0 += 8)
        {
            fft8ss(fr + i0, fi + i0, is);
        }
#else

        // Finish with a radix-2 step on adjacent pairs.  This is the
        // two-argument in-place butterfly, hence sumdiff2 (the same
        // helper used for the n==2 case above), not sumdiff4.
        for (i0 = 0, i1 = 1; i0 < n; i0 += 2, i1 += 2)
        {
            sumdiff2(fr[i0], fr[i1]);
            sumdiff2(fi[i0], fi[i1]);
        }
#endif  // defined USE_FINAL_RAD8

    }
    else
    {
        // n is a power of 4: last radix-4 pass, no twiddle factors.
        // The main loop exits with ldm==2 here, so m==4 and m4==1.
        m = ((ulong)1 << ldm);
        m4 = (m >> LX);

        for (i0 = 0; i0 < n; i0 += m)
        {
            double xr, yr, ur, vr, xi, yi, ui, vi;

            i1 = i0 + m4;
            i2 = i1 + m4;
            i3 = i2 + m4;

            sumdiff4(fr[i0], fr[i2], xr, ur);
            sumdiff4(fi[i0], fi[i2], xi, ui);

            if (is < 0)
            {
                sumdiff4(fi[i1], fi[i3], yi, vr);
                sumdiff4(fr[i3], fr[i1], yr, vi);
            }
            else
            {
                sumdiff4(fi[i3], fi[i1], yi, vr);
                sumdiff4(fr[i1], fr[i3], yr, vi);
            }

            sumdiff4(xr, yr, fr[i0], fr[i1]);
            sumdiff4(xi, yi, fi[i0], fi[i1]);

            sumdiff4(ur, vr, fr[i2], fr[i3]);
            sumdiff4(ui, vi, fi[i2], fi[i3]);
        }
    }
}
// ===================== end DIF4_FFT_CORE =======================
