#include <math.h>
#include <assert.h>

#include "fxtaux.h"

// RX is not referenced in the code visible in this file; presumably the
// radix (4 == 1<<LX) -- confirm before relying on it.
#define RX 4
// LX is the amount by which the block-length exponent m is reduced per
// pass (m-=LX) and the shift that turns a block length into its quarter
// (mr=m2>>LX), i.e. log2 of the radix used by dif4_fft().
#define LX 2

// for debug:
// PR(x) wraps the trace statements inside dif4_fft().  It expands to
// nothing, so those statements are compiled out.  The traces use
// cout/flush, hence the (disabled) stream header below.
//#include <iostream.h>
#define PR(x)  

// When defined, dif4_fft() stops its radix-4 loop one pass earlier for
// odd ldn and finishes blocks of 8 with fft8ss(); otherwise it finishes
// with a radix-2 pass.
#define USE_FINAL_RAD8  // recommended


void 
dif4_fft(double *fr, double *fi, int ldn, int is)
//
// optimized version of radix 4 fft
//
{
    long n,m,m2,mr;
    int j,r;
    int i0,i1,i2,i3;
    double ph0,c,s;
    double c2,s2,c3,s3;

    PR( cout<<"\n (re-ver) dif4_fft(): "<<flush; )

    n=(1<<ldn);

    if(n<=2)
    {
        if(n==2)  // data length is 2
        {
            double t;
            t=fr[0]; fr[0]+=fr[1]; fr[1]=t-fr[1];
            t=fi[0]; fi[0]+=fi[1]; fi[1]=t-fi[1];
        }

        return;
    }

#if defined USE_FINAL_RAD8
    for(m=ldn; m>=(LX<<1); m-=LX)
#else
    for(m=ldn; m>=(LX<<(!(m&1))); m-=LX)
#endif  // defined USE_FINAL_RAD8
    {
        PR( cout<<"\n m="<<m<< flush; )

        m2=(1<<m);
        mr=(m2>>LX);
        ph0=(is>0?2.0*M_PI:-2.0*M_PI)/m2;

        for(j=0; j<mr; j++)
        {
	    PR( cout<<"\n    j="<<j<< flush; )

            sincos(&c,&s,j*ph0);

            c2=c*c-s*s;
            s2=2.0*c*s;

            c3=c2*c-s2*s;
            s3=c2*s+s2*c;

	    PR( cout<<"\n   loop:    "<< flush; )
            for(r=0, i0=j+r; r<n; r+=m2, i0+=m2)
            {
	        double t;
		double xr,yr,ur,vr, xi,yi,ui,vi;

                i1=i0+mr;
                i2=i1+mr;
                i3=i2+mr;

		PR( cout<<"\n           i0="<<i0<<"  i1="<<i1<<"  i2="<<i2<<"  i3="<<i3<<flush; )

                xr=((ur=fr[i0])+(t=fr[i2])); 
                ur-=t; 

                xi=((ui=fi[i0])+(t=fi[i2])); 
		ui-=t;

		if(is<0)
		{
		    yi=((vr=fi[i1])+(t=fi[i3])); 
		    vr-=t;
    
		    yr=((t=fr[i1])+(vi=fr[i3])); 
		    vi-=t;
		}
		else
		{
		    yi=((t=fi[i1])+(vr=fi[i3])); 
		    vr-=t;
    
		    yr=((vi=fr[i1])+(t=fr[i3])); 
		    vi-=t;
        	}

                fr[i0]=(xr+yr);     
                yr-=xr;

                fi[i0]=(xi+yi);  
		yi-=xi;
                
		fr[i1]=  yi*s2-yr*c2; 
                fi[i1]=-(yr*s2+yi*c2);


                xr=(ur+vr);  
                yr=(ur-vr); 

                xi=(ui+vi);  
		yi=(ui-vi);  

		fr[i3]=yr*c3-yi*s3; 
                fi[i3]=yr*s3+yi*c3;

		fr[i2]=xr*c -xi*s;  
                fi[i2]=xr*s +xi*c;
	    }
        }
    }


    if( (ldn&1)!=0 )  // n is not a power of 4, need a radix 2 step
    {
#define i0 r

#if defined USE_FINAL_RAD8

        PR( cout<<"\n  final 8:   "<< flush; )
        for(r=0; r<n; r+=8)
	{
	    PR( cout<<"\n           i0="<<i0<<"  (i1,...,i7) "<<flush; )
	    fft8ss(fr+r,fi+r,is);
	}
#else

	PR( cout<<"\n  final 2:    "<< flush; )
        for(r=0,i1=1; r<n; r+=2, i1+=2)
	{
            double t;
	    PR( cout<<"\n           i0="<<i0<<"  i1="<<i1<<flush; )

            t=fr[i0]-fr[i1]; fr[i0]+=fr[i1]; fr[i1]=t;
            t=fi[i0]-fi[i1]; fi[i0]+=fi[i1]; fi[i1]=t;
	}
#endif  // defined USE_FINAL_RAD8

#undef i0
    }
    else
    {

        m2=(1<<m);
        mr=(m2>>LX);
#define i0 r
	PR( cout<<"\n   final 4:   "<< flush; )

        for(r=0; r<n; r+=m2)
        {
	    double t;
	    double xr,yr,ur,vr, xi,yi,ui,vi;

            i1=i0+mr;
            i2=i1+mr;
            i3=i2+mr;

	    PR( cout<<"\n           i0="<<i0<<"  i1="<<i1<<"  i2="<<i2<<"  i3="<<i3<<flush; )

            xr=((ur=fr[i0])+(t=fr[i2])); 
            ur-=t; 

            xi=((ui=fi[i0])+(t=fi[i2])); 
	    ui-=t;


            if(is<0)
            {
                yi=((vr=fi[i1])+(t=fi[i3])); 
                vr-=t;

                yr=((t=fr[i1])+(vi=fr[i3])); 
                vi-=t;
            }
            else
            {
                yi=((t=fi[i1])+(vr=fi[i3])); 
                vr-=t;

                yr=((vi=fr[i1])+(t=fr[i3])); 
                vi-=t;
	    }

            fr[i0]=(xr+yr);     
            fr[i1]=(xr-yr);

            fi[i0]=(xi+yi);  
	    fi[i1]=(xi-yi);

	    fr[i2]=(ur+vr);  
	    fr[i3]=(ur-vr); 

            fi[i2]=(ui+vi);  
            fi[i3]=(ui-vi);  
        }
#undef i0
    } 

    scramble(fr,fi,n);

}
