
#include <math.h>
#include <assert.h>

#include "fxt.h"
#include "fxtdefs.h"  // SUMDIFF, CSQR, CMULT
#include "revbinpermute.h"
#include "sincos.h"


// Log2 of the radix: the stage exponent ldm advances by LX per radix-4 pass,
// and a block of length m is split into sub-blocks of length m>>LX.
static const ulong LX = 2;
// The radix itself (2**LX).
static const ulong RX = 4;


void
dit4_fft_core(double *fr, double *fi, ulong ldn)
// Radix-4 decimation in time (DIT) FFT, computed in place.
// isign = +1
// fr[], fi[]: real and imaginary parts, each of length n = 2**ldn.
// Input data must be in revbin_permuted order.
//
// Outline:
//  - n<=2 is handled directly.
//  - The first pass is a radix-8 step (ldn odd) or a twiddle-free
//    radix-4 step (ldn even), so that the remaining passes can all
//    be radix-4.
//  - Every further pass combines four sub-blocks of length m/4
//    into one block of length m, using the twiddle factors
//    (c,s), (c2,s2), (c3,s3).
{
    // Shift a ulong one, not an int one: (1<<ldn) is evaluated in int
    // and overflows for ldn>=31 even where ulong is wider.
    const ulong n = ((ulong)1<<ldn);

    if ( n<=2 )
    {
        if ( n==2 )
        {
            SUMDIFF2(fr[0], fr[1]);
            SUMDIFF2(fi[0], fi[1]);
        }
        return;
    }


    ulong ldm = (ldn&1);
    if ( ldm!=0 )  // n is not a power of 4, need a radix 8 step
    {
        for (ulong i0=0; i0<n; i0+=8)  fft8_dit_core(fr+i0, fi+i0);
    }
    else
    {
        // Twiddle-free radix-4 butterflies on groups of 4 adjacent elements.
        for (ulong i0=0; i0<n; i0+=4)
        {
            double xr,yr,ur,vr, xi,yi,ui,vi;
            ulong i1 = i0 + 1;
            ulong i2 = i1 + 1;
            ulong i3 = i2 + 1;

            SUMDIFF4(fr[i0], fr[i1], xr, ur);
            SUMDIFF4(fr[i2], fr[i3], yr, vi);
            SUMDIFF4(fi[i0], fi[i1], xi, ui);
            SUMDIFF4(fi[i3], fi[i2], yi, vr);

            SUMDIFF4(ui, vi, fi[i1], fi[i3]);
            SUMDIFF4(xi, yi, fi[i0], fi[i2]);
            SUMDIFF4(ur, vr, fr[i1], fr[i3]);
            SUMDIFF4(xr, yr, fr[i0], fr[i2]);
        }
    }
    // Blocks now have length 8 (ldm==1) or 4 (ldm==0);
    // the next block length is 4 times that.
    ldm += 2*LX;


    for ( ; ldm<=ldn; ldm+=LX)
    {
        ulong m = ((ulong)1<<ldm);  // block length of this pass
        ulong m4 = (m>>LX);         // quarter block length
        const double ph0 = 2.0*M_PI/m;

        for (ulong j=0; j<m4; j++)
        {
            // (c,s) from the angle j*ph0; (c2,s2) and (c3,s3) are derived
            // from it via CSQR4 / CMULT6 (presumably square and cube of
            // the twiddle c + i*s -- see fxtdefs.h).
            double c,s,c2,s2,c3,s3;
            sincos(j*ph0, &s, &c);
            CSQR4(c, s, c2, s2);
            CMULT6(c, s, c2, s2, c3, s3);

            // Apply the same twiddles to position j of every block.
            for (ulong r=0, i0=j+r;  r<n;  r+=m, i0+=m)
            {
                double xr,yr,ur,vr, xi,yi,ui,vi;
                ulong i1 = i0 + m4;
                ulong i2 = i1 + m4;
                ulong i3 = i2 + m4;

                CMULT6(c2, s2, fr[i1], fi[i1], xr, xi);

                SUMDIFF3R(xr, fr[i0], ur);
                SUMDIFF3R(xi, fi[i0], ui);

                CMULT6(c,  s,  fr[i2], fi[i2], yr, vr);
                CMULT6(c3, s3, fr[i3], fi[i3], vi, yi);

                SUMDIFF2(yr, vi);
                SUMDIFF2(yi, vr);

                SUMDIFF4(ur, vr, fr[i1], fr[i3]);
                SUMDIFF4(ui, vi, fi[i1], fi[i3]);
                SUMDIFF4(xr, yr, fr[i0], fr[i2]);
                SUMDIFF4(xi, yi, fi[i0], fi[i2]);
            }
        }
    }
}
// ==================== end =================



void
dit4_fft_core(Complex *f, ulong ldn)
// Radix-4 decimation in time (DIT) FFT, computed in place.
// isign = -1
// f[]: complex data of length n = 2**ldn.
// Input data must be in revbin_permuted order.
//
// Same pass structure as the (double *, double *) variant:
// a radix-8 (ldn odd) or twiddle-free radix-4 (ldn even) first pass,
// followed by radix-4 passes with twiddle factors.
// The lines marked "isign" are the ones that fix the sign of the transform.
{
    // Shift a ulong one, not an int one: (1<<ldn) is evaluated in int
    // and overflows for ldn>=31 even where ulong is wider.
    const ulong n = ((ulong)1<<ldn);

    if ( n<=2 )
    {
        if ( n==2 )  SUMDIFF2(f[0], f[1]);
        return;
    }


    ulong ldm = (ldn&1);
    if ( ldm!=0 )  // n is not a power of 4, need a radix 8 step
    {
        for (ulong i0=0; i0<n; i0+=8)  fft8_dit_core(f+i0);
    }
    else
    {
        // Twiddle-free radix-4 butterflies on groups of 4 adjacent elements.
        for (ulong i0=0; i0<n; i0+=4)
        {
            Complex x,y,u,v;
            ulong i1 = i0 + 1;
            ulong i2 = i1 + 1;
            ulong i3 = i2 + 1;

            SUMDIFF4(f[i0], f[i1], x, u);
            SUMDIFF4(f[i2], f[i3], y, v);
            v *= Complex(0, -1);  // isign

            SUMDIFF4(u, v, f[i1], f[i3]);
            SUMDIFF4(x, y, f[i0], f[i2]);
        }
    }
    // Blocks now have length 8 (ldm==1) or 4 (ldm==0);
    // the next block length is 4 times that.
    ldm += 2*LX;


    for ( ; ldm<=ldn; ldm+=LX)
    {
        ulong m = ((ulong)1<<ldm);  // block length of this pass
        ulong m4 = (m>>LX);         // quarter block length
        const double ph0 = -2.0*M_PI/m;  // isign

        for (ulong j=0; j<m4; j++)
        {
            // (c,s) from the angle j*ph0; (c2,s2) and (c3,s3) are derived
            // from it via CSQR4 / CMULT6 (presumably square and cube of
            // the twiddle c + i*s -- see fxtdefs.h).
            double c,s,c2,s2,c3,s3;
            sincos(j*ph0, &s, &c);
            CSQR4(c, s, c2, s2);
            CMULT6(c, s, c2, s2, c3, s3);

            // Apply the same twiddles to position j of every block.
            for (ulong r=0, i0=j+r;  r<n;  r+=m, i0+=m)
            {
                ulong i1 = i0 + m4;
                ulong i2 = i1 + m4;
                ulong i3 = i2 + m4;

                Complex x = f[i1] * Complex(c2, s2);
                Complex u;
                SUMDIFF3R(x, f[i0], u);

                Complex v = f[i3] * Complex(c3,  s3);
                Complex y = f[i2] * Complex(c,  s);
                SUMDIFF2(y, v);
                v *= Complex(0, -1);  // isign

                SUMDIFF4(u, v, f[i1], f[i3]);
                SUMDIFF4(x, y, f[i0], f[i2]);
            }
        }
    }
}
// ==================== end =================


void
dit4_fft(double *fr, double *fi, ulong ldn, int is)
// fast fourier transform
// radix 4 decimation in time algorithm
{
    revbin_permute(fr, 1<<ldn);
    revbin_permute(fi, 1<<ldn);

    if ( is>0 )  dit4_fft_core(fr, fi, ldn);
    else         dit4_fft_core(fi, fr, ldn);
}
// ==================== end =================
