#include <math.h>
#include <assert.h>
#include <iostream.h>

#include "fxt.h"
#include "fxtaux.h"
#include "auxk.h"


// The transforms used by this file: FFT and FFT0 simply forward
// their four arguments to fht_fft() and fht_fft0() respectively.
// NOTE(review): FFT0 is presumably the variant for zero padded input
// -- confirm against the declaration of fht_fft0().
#define FFT(a,b,c,d)  fht_fft(a,b,c,d)
#define FFT0(a,b,c,d) fht_fft0(a,b,c,d)

// Disk <--> memory copy wrappers.
// As used further down in this file:
//   kd_copyN(file, filepos, mem, len)  is called where data is loaded from a file
//   dk_copyN(file, mem, filepos, len)  is called where data is stored to a file
// The suffix 1 is used with the real-part file, 2 with the imaginary-part file.
//
// The disabled (#if 0) branch routes all four wrappers to kd_debug()/dk_debug(),
// passing the wramr[] or wrami[] array, wramsz, and the variable n
// of the calling scope as extra arguments.
#if 0
#define kd_copy1(a,b,c,d) kd_debug(a,b,c,d,wramr,wramsz,n)
#define dk_copy1(a,b,c,d) dk_debug(a,b,c,d,wramr,wramsz,n)
#define kd_copy2(a,b,c,d) kd_debug(a,b,c,d,wrami,wramsz,n)
#define dk_copy2(a,b,c,d) dk_debug(a,b,c,d,wrami,wramsz,n)
#else
// Normal operation: both variants map to the plain kd_copy()/dk_copy().
#define kd_copy1(a,b,c,d) kd_copy(a,b,c,d)
#define dk_copy1(a,b,c,d) dk_copy(a,b,c,d)
#define kd_copy2(a,b,c,d) kd_copy(a,b,c,d)
#define dk_copy2(a,b,c,d) dk_copy(a,b,c,d)
#endif


//
// Square n complex numbers in place:
//   (r[k] + I*i[k])  :=  (r[k] + I*i[k])^2   for k=0..n-1
//
// r : real parts      (n elements), overwritten with  re*re - im*im
// i : imaginary parts (n elements), overwritten with  2*re*im
// n : number of elements; nothing is done if n<=0
//
void
complex_sqr(double *r, double *i, long n)
{
    // A negative count would let the countdown below walk through
    // negative indices (out of bounds), so reject it up front.
    if ( n<=0 )  return;

    while ( n-- )
    {
        double t=i[n];       // save old imaginary part, needed for the real part
        i[n] *= 2.0*r[n];    // im := 2*re*im  (uses the still unmodified r[n])
        r[n] *= r[n];        // re := re*re
        r[n] -= t*t;         // re := re*re - im*im
    }
}

// When defined, mass_storage_convolution() lengthens the rows (raises ldn2,
// lowers ldn1) until ldn2 reaches ld(wwsz)-1; intended to improve disk access.
#define USE_MAX_ROW_LEN  // improve disk access 

// Row and column index of the element currently processed in the
// forward pass.  These expand to the local variables k1, k2, n2w and wct
// of the function that uses them, so they only work inside such a scope.
#define ROW (k1)
#define COL (k2+n2w*wct)

// Same indices for the backward pass (currently identical to ROW/COL).
#define ROW2 (k1)
#define COL2 (k2+n2w*wct)

// Sign passed as last argument to FFT/FFT0 for the forward transform;
// the backward transform passes -is.
// NOTE(review): from here on the preprocessor replaces every identifier
// token spelled 'is' -- keep that in mind when naming variables below.
#define is (+1)  // use +1


// Two static arrays of wramsz doubles each.  Within this file they are
// referenced only by the disabled (#if 0) kd_debug/dk_debug wrapper macros.
#define wramsz 16384
double  wramr[wramsz];
double  wrami[wramsz];

// Names of the two scratch files: FILE0 is opened as the real-part file,
// FILE1 as the imaginary-part file (which also receives the result).
// NOTE(review): the names are fixed, so two concurrent runs would use
// the same files.
#define FILE0  "/tmp/tmpfile0.tmp"
#define FILE1  "/tmp/tmpfile1.tmp"

//
// XXX CONSTRUCTION AREA !
//

FILE*
mass_storage_convolution(double *ai, long an, double *bi, long bn, long &n)
//
// Convolution of two real sequences where the transformed data is kept
// in two disk files (real and imaginary part) and only a workspace sized
// part of it is held in RAM at any time.
//
// ai, an : first sequence and its length; an must be a power of two
// bi, bn : second sequence and its length; bn must equal an
// n      : output: set to 2*an, the length of the zero padded transform
//
// Method, as implemented below:
//   ai[] is loaded as real part and bi[] as imaginary part of one complex
//   sequence, which is zero padded to length n and viewed as a matrix of
//   n1 rows with n2 elements each (n=n1*n2).
//   Pass 1: transform the columns, multiply by exp(ph0*row*col), write to disk.
//   Pass 2: transform each row, square it element-wise, transform it back.
//   Pass 3: multiply by exp(-ph0*row*col), transform the columns back and
//           write the imaginary part, scaled by 1/(2*n), to the imaginary file.
//
// Returns the still open FILE for FILE1 which holds the result;
// the caller has to fclose() it.  The real-part file is closed here.
// NOTE(review): the file position of the returned stream is whatever the
// last dk_copy() call left it at -- confirm before reading from it.
//
// All parameter and consistency checks are done via assert().
//
// NOTE(review): this routine is unfinished: the workspace pointers are
// assigned from '....' placeholders and wwsz is never set (see below).
//
{
    int    ldnx;
    double *pa,*pb;

    // --- data on disk:
    FILE   *rfile=NULL, *ifile=NULL;    // the big files (real, imaginary part)
    long   n1,n2;                       // n1=number of rows (=length of a column),
                                        // n2=number of columns (=length of a row)
    int    ldn,ldn1,ldn2;               // base 2 logarithms of n, n1, n2
    size_t pf;                          // position in the files (in doubles, as passed to kd_copy/dk_copy)

    // --- the workspace:
    long   wwsz;                     // size (in complex) of ...
    double *wwr=NULL,*wwi=NULL;      //   RAM workspace
    long   wct;                      // counts the workspace sized parts, 0..v-1
    double *pwr=NULL, *pwi=NULL;     // running pointers into the workspace

    // --- the buffer:
    long   bufsz;
    double *bufr,*bufi;

    long   v;                        // (total data)/(size of workspace)
    long   n1w,n2w;                  // number of rows/columns that fit into the workspace
    long   k1,k2;                    // row index, column index (inside workspace)
    double ph0;

    ldnx=ld(an);
    assert( an==(((long)1)<<ldnx) );  // an must be a power of two
    assert( an==bn );

    ldn=ldnx+1;     // factor 2 for zero padding
    n=(((long)1)<<ldn);


    // --- workspace:
    //    wwsz=hfg_wsdbls/2;  // re+im
    //    assert(wwsz<n);     // else this routine makes no sense
    //
    //    wwr=(double *)gws.get_ws1();
    //    wwi=(double *)gws.get_ws2();
    //
    // NOTE(review): the two assignments below are placeholders and do not
    // compile; wwsz is used from here on without ever being assigned.
    // Both have to be supplied before this routine can work.
    wwr=(double *)....
    wwi=(double *)....

    v=n/wwsz;  // ratio  (total data)/(size of workspace)

    // --- n1,n2:
    ldn1=(ldn>>1);
    ldn2=ldn-ldn1;


#if defined USE_MAX_ROW_LEN
    int ldmaxrowlen=ld(wwsz)-1; // fails if set to ld(wwsz), makes no sense anyway

    while(ldn2<ldmaxrowlen) // one complete row must fit into workspace
    {
        ldn1--;
        ldn2++;
    }
#endif  // defined USE_MAX_ROW_LEN


    // shift a long (not an int), consistent with the computation of n above:
    n1=(((long)1)<<ldn1);
    n2=(((long)1)<<ldn2);

    assert( n2>=n1 );

    n2w=n2/v;     // how many cols fit into workspace
    n1w=n1/v;     // how many rows fit into workspace

    assert(wwsz==n1w*n2);
    assert(wwsz==n2w*n1);

    // --- buffer: (workspace for the transformations over columns)
    bufsz=n1;
    bufr=new double[bufsz]; assert(bufr);
    bufi=new double[bufsz]; assert(bufi);

    assert(bufsz<=wwsz);
    assert(wwsz<=n);
    assert(wwsz>=n2);


    // --- data on disk:
    rfile=fopen(FILE0,"w+"); assert(rfile);
    ifile=fopen(FILE1,"w+"); assert(ifile);


    ph0=is*2.0*M_PI/n;
    for(pf=0,wct=0; wct<v; ++wct,pf+=wwsz) // ---------- fft and exp cols
    {
        // fill workspace  (get n2w cols from data):
        // must fail for n1==1 !
        // condition is <n1/2 for zero padded data (else <n1):
        // ai[] goes to the real part, bi[] to the imaginary part.
        for(pa=ai+wct*n2w,pb=bi+wct*n2w,pwr=wwr,pwi=wwi,k1=0;
            k1<n1/2;
            ++k1,pa+=n2,pb+=n2,pwr+=n2w,pwi+=n2w)
        {
            d_copy(pa,pwr,n2w);
            d_copy(pb,pwi,n2w);
        }

        // fill workspace: zero pad
        // (the rows n1/2..n1-1 occupy the upper half of the workspace)
        d_null(wwr+wwsz/2,wwsz/2);
        d_null(wwi+wwsz/2,wwsz/2);

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather column k2 (stride n2w) into the contiguous buffer:
            for(pwr=wwr+k2,pwi=wwi+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            FFT0(bufr,bufi,ldn1,is);

            // scatter back while applying the factors exp(ph0*row*col):
            // NOTE(review): cmult6() presumably stores the product of its
            // first two argument pairs in the last pair -- confirm.
            for(pwr=wwr+k2,pwi=wwi+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                double c,s,phi;
                phi=ph0*ROW*COL;
                sincos(&c,&s,phi);
                cmult6(bufr[k1],bufi[k1],c,s,*pwr,*pwi);
            }
        }

        // write the n2w columns to their place in the files
        // (one piece of n2w elements for each of the n1 rows):
        for(pf=wct*n2w,pwr=wwr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            dk_copy1(rfile,pwr,pf,n2w);
        }

        for(pf=wct*n2w,pwi=wwi,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // ---------- fft rows:
    for(pf=0,wct=0; wct<v; ++wct,pf+=wwsz)
    {
        // read n1w complete rows (wwsz contiguous elements):
        kd_copy1(rfile,pf,wwr,wwsz);
        kd_copy2(ifile,pf,wwi,wwsz);

        // loop thru rows:
        for(pwr=wwr,pwi=wwi,k1=0; k1<n1w; ++k1,pwr+=n2,pwi+=n2)
        {
            assert( (pwr-wwr)<wwsz );
            FFT(pwr,pwi,ldn2,is);
            complex_sqr(pwr,pwi,n2); // for convolution

/* ************** begin shortcut ****************** (saves one pass thru data) 
 *        }
 *
 *        dk_copy1(rfile,wwr,pf,wwsz);
 *        dk_copy2(ifile,wwi,pf,wwsz);  
 *    }
 *
 *    // if the *** shortcut is not made (and the data not squared)
 *    // we got the fourier transformed (zero padded) data here in a _transposed_ form
 *    // (in the files) 
 *
 *    //-------- PART 2: mass storage fourier back ---------------------------
 *    for(pf=0,wct=0; wct<v; ++wct,pf+=wwsz) // ---------- fft 'rows' in c
 *    {
 *        kd_copy1(rfile,pf,wwr,wwsz);  
 *        kd_copy2(ifile,pf,wwi,wwsz);  
 *         
 *        // loop thru rows:
 *        for(pwr=wwr,pwi=wwi,k1=0; k1<n1w; ++k1,pwr+=n2,pwi+=n2)
 *        {
 *           assert( (pwr-wwr)<wwsz );
 *           assert( (pwr-wwr+n2)<wwsz );
 *
 ******************** end shortcut *********************** */

            FFT(pwr,pwi,ldn2,-is);
        }

        dk_copy1(rfile,wwr,pf,wwsz);   // DISK write
        dk_copy2(ifile,wwi,pf,wwsz);
    }

    ph0=-is*2.0*M_PI/n;
    for(pf=0,wct=0; wct<v; ++wct,pf+=wwsz) // ---------- exp and fft 'cols'
    {
        // fill workspace from DISK (get n2w cols from data):
        for(pf=wct*n2w,pwr=wwr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            kd_copy1(rfile,pf,pwr,n2w);
        }

        for(pf=wct*n2w,pwi=wwi,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            kd_copy2(ifile,pf,pwi,n2w);
        }

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather column k2 into the buffer:
            for(pwr=wwr+k2,pwi=wwi+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            // apply the factors exp(-ph0*row*col) before transforming back:
            // NOTE(review): cmult4() presumably multiplies the last pair
            // in place by the first pair -- confirm.
            for(k1=0; k1<n1; ++k1)
            {
                double c,s,phi;
                phi=ph0*ROW2*COL2;
                sincos(&c,&s,phi);
                cmult4(c,s,bufr[k1],bufi[k1]);
            }

            FFT(bufr,bufi,ldn1,-is);

            // only the imaginary part is kept:
            for(pwi=wwi+k2,k1=0; k1<n1; ++k1,pwi+=n2w)
            {
                *pwi=bufi[k1];
            }
        }

        // normalization by 1/(2*n):
        d_multiply(wwi,wwsz,1.0/(n+n));

        for(pf=wct*n2w,pwi=wwi,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // the convolved data is now in ifile (in normal order):
    //    gws.let_ws1();
    //    gws.let_ws2();

    // release the column buffers (they were leaked before):
    delete [] bufr;
    delete [] bufi;

    fclose(rfile);

    return ifile;
}
/*===================== end MASS_STORAGE_CONVOLUTION =========================*/
