#include <math.h>
#include <assert.h>
//#include <stdio.h>
#include <iostream.h>

#include "hfloatfu.h"
#include "../fxt/fxt.h"
#include "../fxt/fxtaux.h"
//#include "mybuiltin.h"
#include "auxid.h"
#include "auxk.h"
#include "workspc.h"


// Debug-output hooks.  Both macros expand to nothing, so every statement
// wrapped in PR(...) or PRX(...) is removed by the preprocessor.
// To enable the output, redefine the macro as its argument.
#define PR(x)
#define PRX(x)   // for debug


// FFT entry points used in this file; both simply forward to the
// fht_fft() / fht_fft0() routines (declared outside this file).
// NOTE(review): FFT0 is only applied to data whose upper half was
// zero-filled, so it is presumably the variant for zero padded input -
// confirm against the fxt headers.
#define FFT(a,b,c,d)  fht_fft(a,b,c,d)
#define FFT0(a,b,c,d) fht_fft0(a,b,c,d)

// Wrappers for the transfers between the disk files and RAM:
//   kd_copyX(file, filepos, ram, count)   is used to fill RAM from disk
//   dk_copyX(file, ram, filepos, count)   is used to write RAM to disk
// The suffix 1 marks transfers of the real part file, 2 those of the
// imaginary part file.
// With '#if 1' the calls go to kd_debug()/dk_debug() instead, which
// additionally receive the matching workspace pointer, wramsz and n
// (these must then be visible at the call site).
#if 0
#define kd_copy1(a,b,c,d) kd_debug(a,b,c,d,wramr,wramsz,n)
#define dk_copy1(a,b,c,d) dk_debug(a,b,c,d,wramr,wramsz,n)
#define kd_copy2(a,b,c,d) kd_debug(a,b,c,d,wrami,wramsz,n)
#define dk_copy2(a,b,c,d) dk_debug(a,b,c,d,wrami,wramsz,n)
#else
#define kd_copy1(a,b,c,d) kd_copy(a,b,c,d)
#define dk_copy1(a,b,c,d) dk_copy(a,b,c,d)
#define kd_copy2(a,b,c,d) kd_copy(a,b,c,d)
#define dk_copy2(a,b,c,d) dk_copy(a,b,c,d)
#endif

int mass_digit_mul(LIMB *ai, long an, LIMB *bi, long bn, LIMB *res, long resn)
//
// Multiply the LIMB fields ai[] (an limbs) and bi[] (bn limbs).
// The product is written to res[] (resn limbs).
//
// Works in two steps:
//   1. mass_digit_convolution() computes the convolution of the two
//      fields and hands back the file holding it, together with its
//      length
//   2. mass_carry() turns that file into the limbs of res[]
//
// The return value is whatever mass_carry() returned.
//
// NOTE(review): the file returned by mass_digit_convolution() is not
// closed here - presumably mass_carry() takes care of that; confirm.
//
{
    long cnvlen;   // length of the convolution, set by mass_digit_convolution()

    FILE * const cnvfile = mass_digit_convolution(ai,an,bi,bn,cnvlen);

    PR( kd_print("\n datt kricht carry: \n",cnvfile,cnvlen); )
    PRX( cout<<"\n n===="<<cnvlen; )

    const long carryret = mass_carry(cnvfile,cnvlen,res,resn);

    PR( i_print("\n mass_mult_dig(): result:\n",res,MIN(64,resn)); )

    return carryret;
}
// ================= end MASS_DIGIT_MULTIPLY ==========================



// When defined, mass_digit_convolution() lengthens the rows (and shortens
// the columns accordingly) until ldn2 reaches ld(wramsz)-1.
#define USE_MAX_ROW_LEN  // improve disk access 

// Row and column index of the element currently processed in the
// column passes of mass_digit_convolution(); their product (times ph0)
// is the phase of the factor applied to that element.
// These macros capture the locals k1, k2, n2w and wct by name, so they
// are only usable where those variables are in scope.
// ROW/COL are used in the forward pass:
#define ROW (k1)
#define COL (k2+n2w*wct)

// ROW2/COL2 are used in the backward pass (same definitions):
#define ROW2 (k1)
#define COL2 (k2+n2w*wct)

// Sign passed to the forward transforms (the backward ones use -is).
// Caution: from here on the preprocessor replaces every identifier
// named 'is' with (+1).
#define is (+1)  // use +1


FILE* mass_digit_convolution(LIMB *ai, long an, LIMB *bi, long bn, long &n)
//
// Disk based ("mass storage") FFT convolution of the LIMB fields
// ai[] and bi[], for the case that the data does not fit into the
// RAM workspace.
//
// Arguments:
//   ai, an   first operand and its length
//   bi, bn   second operand and its length
//   n        OUTPUT: length of the transform, n==2*an (zero padding)
//
// Requirements (checked with assert()):
//   an is a power of two, an==bn,
//   the workspace (hfg_wsdbls/2 complex values) is smaller than n
//   but can hold at least one complete row.
//
// Returns the still open file (hfg_file1) that contains the n values of
// the convolution as doubles.  The file is not closed here.
// The scratch file hfg_file0 is closed (but not removed) before returning.
//
// Method: the n=n1*n2 values are treated as a matrix with n1 rows of
// length n2 that lives in two files (real and imaginary part).
// ai[] is put into the real part and bi[] into the imaginary part, so
// that a single complex transform handles both operands: after the
// transform the data is squared (complex_sqr()), the imaginary part of
// (a+i*b)^2 is 2*a*b, and so only the imaginary part of the back
// transform is kept and scaled by 1/(2*n).
// NOTE(review): this relies on complex_sqr() squaring element by
// element - confirm against its definition.
//
// NOTE(review): the files are opened with mode "w+" (text mode); if the
// copy routines write raw doubles this needs "w+b" on platforms that
// distinguish text and binary streams - confirm.
//
{
    int ldnx;
    LIMB *pa,*pb;

    // --- data on disk:
    FILE *rfile, *ifile;    // the big files: real part, imaginary part
    //long n;               // total # of complex vals  =n1*n2
    long n1,n2;             // n1=#of rows (len of col), n2=len of row (#of cols)
    int ldn,ldn1,ldn2;      // base 2 logarithms of n, n1, n2
    size_t pf;              // position in the files

    // --- the workspace:
    long wramsz;           // size (in complex) of ...
    double *wramr,*wrami;  //   RAM workspace
    long wct;              // counts the passes thru the workspace
    double  *pwr=NULL, *pwi=NULL;  // running pointers into wramr, wrami

    // --- the buffer:
    long bufsz;
    double *bufr,*bufi;    // holds one column during its transform

    long v;                // ratio  (total data)/(size of workspace)
    long n1w,n2w;          // # of rows / cols that fit into the workspace
    long k1,k2;            // row index, col index (inside workspace)
    double ph0;

    PRX( i_print("\n\n mass_dig_cnvl(): ai=\n",ai,MIN(an,10)); )
    PRX( i_print("\n\n mass_dig_cnvl(): bi=\n",bi,MIN(bn,10)); )

    ldnx=ld(an);
    assert( an==(((long)1)<<ldnx) );
    assert( an==bn );

    ldn=ldnx+1;     // factor 2 for zero padding
    n=(((long)1)<<ldn);
    PRX( cout<<"\n\n mass_dig_cnvl(): ldn="<<ldn; )

    // --- workspace:
    wramsz=hfg_wsdbls/2;  // re+im
    assert(wramsz<n);     // else this routine makes no sense
    PRX( cout<<"\n  wramsz="<<wramsz; )
    wramr=(double *)gws.get_ws1();
    wrami=(double *)gws.get_ws2();

    v=n/wramsz;  // ratio  (total data)/(size of workspace)
    PRX( cout<<"\n we will loop "<<v<<" times thru workspace: \n"; )

    // --- n1,n2:
    ldn1=(ldn>>1);
    ldn2=ldn-ldn1;

#if defined USE_MAX_ROW_LEN
    int ldmaxrowlen=ld(wramsz)-1; // fails if set to ld(wramsz), makes no sense anyway
    PRX( cout<<"\n ldn2="<<ldn2<<"  ldmaxrowlen="<<ldmaxrowlen; )

    while(ldn2<ldmaxrowlen) // one complete row must fit into workspace
    {
        PRX( cout<<" - "; )
        ldn1--;
        ldn2++;
    }
#endif  // defined USE_MAX_ROW_LEN

    n1=(1<<ldn1);
    n2=(1<<ldn2);
    PRX( cout<<"\n ldn="<<ldn<<"  ldn1="<<ldn1<<"  ldn2="<<ldn2; )
    PRX( cout<<"\n n="<<n<<"  n1="<<n1<<"  n2="<<n2; )
    assert( n2>=n1 );

    n2w=n2/v;     // how many cols fit into workspace
    n1w=n1/v;     // how many rows fit into workspace
    PRX( cout<<"\n n1w="<<n1w<<"  n2w="<<n2w; )

    assert(wramsz==n1w*n2);
    assert(wramsz==n2w*n1);

    // --- buffer: (workspace for the transformations over columns)
    // released at the end of this function
    bufsz=n1;
    bufr=new double[bufsz]; assert(bufr);
    bufi=new double[bufsz]; assert(bufi);

    PRX( cout<<"\n 'csz'=n="<<n<<"  wramsz="<<wramsz<<"  bufsz="<<bufsz; )
    assert(bufsz<=wramsz);
    assert(wramsz<=n);
    assert(wramsz>=n2);


    // --- data on disk:
    rfile=fopen(hfg_file0,"w+"); assert(rfile);
    ifile=fopen(hfg_file1,"w+"); assert(ifile);

    //----------- PART 1: mass storage fourier ---------------------------
    ph0=is*2.0*M_PI/n;
    for(pf=0,wct=0; wct<v; ++wct,pf+=wramsz) // ---------- fft and exp cols
    {
        // fill workspace from LIMBS  (get n2w cols from data):
        // must fail for n1==1 !
        // condition is <n1/2 for zero padded data (else <n1):
        // (ai goes to the real part, bi to the imaginary part)
        for(pa=ai+wct*n2w,pb=bi+wct*n2w,pwr=wramr,pwi=wrami,k1=0;
            k1<n1/2;
            ++k1,pa+=n2,pb+=n2,pwr+=n2w,pwi+=n2w)
        {
            id_copy(pa,pwr,n2w);
            id_copy(pb,pwi,n2w);
        }

        // fill workspace: zero pad
        // (the n1/2 rows copied above fill exactly the lower half)
        d_null(wramr+wramsz/2,wramsz/2);
        d_null(wrami+wramsz/2,wramsz/2);

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather column k2 (stride n2w) into the buffer:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            FFT0(bufr,bufi,ldn1,is);

            // multiply by exp(i*ph0*row*col) while writing the column back:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                double c,s,phi;
                phi=ph0*ROW*COL;
                sincos(&c,&s,phi);
                cmult6(bufr[k1],bufi[k1],c,s,*pwr,*pwi);
            }
        }

        // write the n2w cols to their places in the files
        // (one piece of length n2w per row):
        for(pf=wct*n2w,pwr=wramr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            dk_copy1(rfile,pwr,pf,n2w);
        }

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // ---------- fft rows:
    for(pf=0,wct=0; wct<v; ++wct,pf+=wramsz)
    {
        // get n1w complete rows (contiguous in the files):
        kd_copy1(rfile,pf,wramr,wramsz);
        kd_copy2(ifile,pf,wrami,wramsz);

        // loop thru rows:
        for(pwr=wramr,pwi=wrami,k1=0; k1<n1w; ++k1,pwr+=n2,pwi+=n2)
        {
            assert( (pwr-wramr)<wramsz );
            FFT(pwr,pwi,ldn2,is);
            complex_sqr(pwr,pwi,n2); // for convolution

            // Shortcut (saves one pass thru data):
            // Without squaring, the files would hold the fourier
            // transformed (zero padded) data in a _transposed_ form
            // once all rows are written back, and PART 2 (mass storage
            // fourier back) would start by reading the rows again.
            // As the squaring works element by element, the row
            // transform of PART 2 is done right here instead.
            FFT(pwr,pwi,ldn2,-is);
        }

        dk_copy1(rfile,wramr,pf,wramsz);   // DISK write
        dk_copy2(ifile,wrami,pf,wramsz);
    }

    //-------- PART 2 (rest): mass storage fourier back ------------------
    ph0=-is*2.0*M_PI/n;
    for(pf=0,wct=0; wct<v; ++wct,pf+=wramsz) // ---------- exp and fft 'cols'
    {
        // fill workspace from DISK (get n2w cols from data):
        for(pf=wct*n2w,pwr=wramr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            kd_copy1(rfile,pf,pwr,n2w);
        }

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            kd_copy2(ifile,pf,pwi,n2w);
        }

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather column k2 (stride n2w) into the buffer:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            // multiply by exp(i*ph0*row*col), this time before the fft:
            for(k1=0; k1<n1; ++k1)
            {
                double c,s,phi;
                phi=ph0*ROW2*COL2;
                sincos(&c,&s,phi);
                cmult(c,s,bufr[k1],bufi[k1]);
            }

            FFT(bufr,bufi,ldn1,-is);

            // only the imaginary part is needed from here on:
            for(pwi=wrami+k2,k1=0; k1<n1; ++k1,pwi+=n2w)
            {
                *pwi=bufi[k1];
            }
        }

        // normalization: 1/n for the transforms and 1/2 for the
        // factor 2 in the imaginary part of the square:
        d_mul(wrami,wramsz,1.0/(n+n));

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // the convolved data is now in ifile (in normal order):
    gws.let_ws1();
    gws.let_ws2();

    // release the column buffers (they were leaked before):
    delete [] bufr;
    delete [] bufi;

    fclose(rfile);

    return ifile;
}
/*===================== end MASS_DIGIT_CONVOLUTION =========================*/
