
#include <math.h>
#include <assert.h>

#include "fxt.h"
#include "fxtdefs.h"  // SUMDIFF, CSQR, CMULT
#include "revbinpermute.h"
#include "sincos.h"


// RX: presumably the radix (4) of the transforms in this file; it is not
// referenced by any code visible here -- TODO confirm it is still needed.
static const ulong RX = 4;
// LX: number of index bits consumed per radix-4 pass (4 == 1<<LX).
// Used as the shift that yields m/4 and as the decrement of ldm per pass.
static const ulong LX = 2;


void
dif4_fft_core(double *fr, double *fi, ulong ldn)
// Radix-4 decimation in frequency (DIF) FFT, core routine.
// fr[], fi[]: real and imaginary parts of the data, transformed in place.
// ldn: base-2 logarithm of the array length (n = 2**ldn).
// Fixed sign of the exponent: isign = +1
// Output data is in revbin_permuted order (no permutation is done here).
{
    // Shift a ulong, not the int literal 1: (1<<ldn) overflows int
    // (undefined behaviour) as soon as ldn >= 31.
    const ulong n = ((ulong)1 << ldn);

    if ( n<=2 )
    {
        // n==1 needs no work, n==2 is a single butterfly per component.
        if ( n==2 )
        {
            SUMDIFF2(fr[0], fr[1]);
            SUMDIFF2(fi[0], fi[1]);
        }
        return;
    }

    // Radix-4 passes with twiddle factors for block lengths
    // m = n, n/4, n/16, ... as long as m >= 16 (ldm >= 2*LX).
    // The remaining pass (m==4 or m==8) is done after this loop.
    for (ulong ldm=ldn; ldm>=(LX<<1); ldm-=LX)
    {
        const ulong m = ((ulong)1 << ldm);  // ulong shift, see above
        const ulong m4 = (m>>LX);           // quarter block length

        const double ph0 = 2.0*M_PI/m;

        for (ulong j=0; j<m4; j++)
        {
            // (c,s) = (cos,sin) of j*2*pi/m.
            // (c2,s2) and (c3,s3) are presumably its square and cube
            // (see CSQR4, CMULT6 in fxtdefs.h).
            double c,s, c2,s2, c3,s3;
            sincos(j*ph0, &s, &c);
            CSQR4(c, s, c2, s2);
            CMULT6(c, s, c2, s2, c3, s3);

            // Same twiddles for element j of every length-m block.
            for (ulong r=0, i0=j+r;  r<n;  r+=m, i0+=m)
            {
                double xr,yr,ur,vr, xi,yi,ui,vi;
                const ulong i1 = i0 + m4;
                const ulong i2 = i1 + m4;
                const ulong i3 = i2 + m4;

                // {x,u} = {f[i0]+f[i2], f[i0]-f[i2]}:
                SUMDIFF4(fr[i0], fr[i2], xr, ur);
                SUMDIFF4(fi[i0], fi[i2], xi, ui);

                // {y,v} = {f[i1]+f[i3], (f[i1]-f[i3])*(0,is)}:
                // the multiplication by i is folded into the argument
                // order (imaginary parts swapped, results crossed).
                SUMDIFF4(fi[i3], fi[i1], yi, vr);
                SUMDIFF4(fr[i1], fr[i3], yr, vi);

                // NOTE: statement order matters from here on, the
                // macros overwrite some of their arguments.
                DIFFSUM3(xr, yr, fr[i0]);
                DIFFSUM3(xi, yi, fi[i0]);

                CMULT6(c2, s2, yr, yi, fr[i1], fi[i1]);

                SUMDIFF4(ur, vr, xr, yr);
                SUMDIFF4(ui, vi, xi, yi);

                CMULT6(c3, s3, yr, yi, fr[i3], fi[i3]);
                CMULT6(c,  s,  xr, xi, fr[i2], fi[i2]);
            }
        }
    }


    if ( (ldn&1)!=0 )  // n is not a power of 4, need a radix 8 step
    {
        for (ulong i0=0; i0<n; i0+=8)
            fft8_dif_core(fr+i0, fi+i0);
    }
    else
    {
        // Final radix-4 pass on blocks of 4 adjacent elements;
        // no twiddle multiplications are needed here.
        for (ulong i0=0; i0<n; i0+=4)
        {
            double xr,yr,ur,vr, xi,yi,ui,vi;
            const ulong i1 = i0 + 1;
            const ulong i2 = i1 + 1;
            const ulong i3 = i2 + 1;

            SUMDIFF4(fr[i0], fr[i2], xr, ur);
            SUMDIFF4(fr[i1], fr[i3], yr, vi);
            SUMDIFF4(fi[i0], fi[i2], xi, ui);
            SUMDIFF4(fi[i3], fi[i1], yi, vr);

            SUMDIFF4(xi, yi, fi[i0], fi[i1]);
            SUMDIFF4(ui, vi, fi[i2], fi[i3]);
            SUMDIFF4(xr, yr, fr[i0], fr[i1]);
            SUMDIFF4(ur, vr, fr[i2], fr[i3]);
        }
    }
}
// ===================== end =======================


void
dif4_fft_core(Complex *f, ulong ldn)
// Radix-4 decimation in frequency (DIF) FFT, core routine
// for complex data.
// f[]: data, transformed in place.
// ldn: base-2 logarithm of the array length (n = 2**ldn).
// Fixed sign of the exponent: isign = +1
// Output data is in revbin_permuted order (no permutation is done here).
{
    // Shift a ulong, not the int literal 1: (1<<ldn) overflows int
    // (undefined behaviour) as soon as ldn >= 31.
    const ulong n = ((ulong)1 << ldn);

    if ( n<=2 )
    {
        // n==1 needs no work, n==2 is a single butterfly.
        if ( n==2 )  SUMDIFF2(f[0], f[1]);
        return;
    }

    // Radix-4 passes with twiddle factors for block lengths
    // m = n, n/4, n/16, ... as long as m >= 16 (ldm >= 2*LX).
    // The remaining pass (m==4 or m==8) is done after this loop.
    for (ulong ldm=ldn; ldm>=(LX<<1); ldm-=LX)
    {
        const ulong m = ((ulong)1 << ldm);  // ulong shift, see above
        const ulong m4 = (m>>LX);           // quarter block length

        const double ph0 = 2.0*M_PI/m;

        for (ulong j=0; j<m4; j++)
        {
            // (c,s) = (cos,sin) of j*2*pi/m.
            // (c2,s2) and (c3,s3) are presumably its square and cube
            // (see CSQR4, CMULT6 in fxtdefs.h).
            double c,s, c2,s2, c3,s3;
            sincos(j*ph0, &s, &c);
            CSQR4(c, s, c2, s2);
            CMULT6(c, s, c2, s2, c3, s3);

            // Same twiddles for element j of every length-m block.
            for (ulong r=0, i0=j+r;  r<n;  r+=m, i0+=m)
            {
                Complex x,y,u,v;
                const ulong i1 = i0 + m4;
                const ulong i2 = i1 + m4;
                const ulong i3 = i2 + m4;

                // {x,u} = {f[i0]+f[i2], f[i0]-f[i2]}
                // {y,v} = {f[i1]+f[i3], (f[i1]-f[i3])*i}
                SUMDIFF4(f[i0], f[i2], x, u);
                SUMDIFF4(f[i1], f[i3], y, v);
                v *= Complex(0,1);

                // NOTE: statement order matters from here on,
                // y is used again after DIFFSUM3 has processed it.
                DIFFSUM3(x, y, f[i0]);
                f[i1] = y * Complex(c2, s2);

                SUMDIFF4(u, v, x, y);
                f[i3] = y * Complex(c3, s3);
                f[i2] = x * Complex(c,  s);
            }
        }
    }


    if ( (ldn&1)!=0 )  // n is not a power of 4, need a radix 8 step
    {
        for (ulong i0=0; i0<n; i0+=8)
            fft8_dif_core(f+i0);
    }
    else
    {
        // Final radix-4 pass on blocks of 4 adjacent elements;
        // the only factor left is the multiplication of v by i.
        for (ulong i0=0; i0<n; i0+=4)
        {
            Complex x,y,u,v;
            const ulong i1 = i0 + 1;
            const ulong i2 = i1 + 1;
            const ulong i3 = i2 + 1;

            SUMDIFF4(f[i0], f[i2], x, u);
            SUMDIFF4(f[i1], f[i3], y, v);
            v *= Complex(0,1);

            SUMDIFF4(x, y, f[i0], f[i1]);
            SUMDIFF4(u, v, f[i2], f[i3]);
        }
    }
}
// ===================== end =======================



void
dif4_fft(double *fr, double *fi, ulong ldn, int is)
// fast fourier transform
// radix 4 decimation in frequency algorithm
{
    if ( is>0 )  dif4_fft_core(fr, fi, ldn);
    else         dif4_fft_core(fi, fr, ldn);

    revbin_permute(fr, 1<<ldn);
    revbin_permute(fi, 1<<ldn);
}
// ===================== end =======================
