

// When defined, mass_digit_convolution() moves bits from ldn1 to ldn2 until
// ldn2 reaches ld(wramsz)-1, i.e. it makes the matrix rows longer
// (fewer rows, each covering a larger contiguous piece of the files).
#define USE_MAX_ROW_LEN  // improve disk access 

// Row and column index of the element currently processed in the forward
// column pass; their product (times ph0) gives the phase of the factor
// exp(i*phi) applied after the column FFT.
// These expand to the locals k1, k2, n2w and wct of the function using them:
// COL is the column index within the whole matrix, not just within the
// n2w columns held in the workspace.
#define ROW (k1)
#define COL (k2+n2w*wct)

// Same indices, used for the phase factor of the backward column pass.
#define ROW2 (k1)
#define COL2 (k2+n2w*wct)

// Sign passed to the forward transforms (the backward ones use -is);
// it also sets the sign of the phase increment ph0.
// NOTE(review): 'is' is a very short macro name; every later occurrence of
// the identifier 'is' in this translation unit gets replaced by (+1).
#define is (+1)  // use +1



FILE* mass_digit_convolution(LIMB *ai, long aprec, LIMB *bi, long bprec, long &n)
//
// Mass storage convolution of ai[] and bi[] via a 'matrix' (two pass) FFT
// whose data lives in two disk files (hfg_file0: real part, hfg_file1:
// imaginary part).  Only wramsz doubles per part are held in RAM at a time.
//
// ai[] is loaded into the real part and bi[] into the imaginary part of one
// complex sequence of length n=2*aprec (upper half zero), which is
// transformed, squared element-wise, and transformed back; only the
// imaginary part, scaled by 1/(2*n), is written back to disk.
//
// Parameters:
//   ai, aprec   first operand and its length in LIMBs
//   bi, bprec   second operand and its length in LIMBs
//   n           output: number of doubles of the result, n==2*aprec
//
// Preconditions (checked with assert() only):
//   aprec is a power of two and aprec==bprec;
//   both files can be opened;
//   the workspace size is compatible with the n1 x n2 decomposition.
//
// Returns the still open file holding the convolved data (in normal order).
// The caller owns the returned FILE* and has to fclose() it.
// The file with the real part is closed before returning.
//
{
    int ldnx;
    LIMB *pa,*pb;

    // --- data on disk:
    FILE *rfile, *ifile;     // the big files (real part, imaginary part)
    long n1,n2;              // data is treated as matrix: n1 rows of length n2
    int ldn,ldn1,ldn2;       // n==2^ldn, n1==2^ldn1, n2==2^ldn2
    size_t pf;               // position argument for the disk copy routines

    // --- the workspace:
    long wramsz;             // size of ...
    double *wramr,*wrami;    //   RAM workspace (real part, imaginary part)
    long wct;                // counts the passes through the workspace
    double *pwr=NULL, *pwi=NULL;

    // --- the buffer:
    long bufsz;
    double *bufr,*bufi;

    long v;                  // number of workspace loads needed for all data
    long n1w,n2w;            // number of rows / columns that fit into workspace
    long k1,k2;              // row / column counters
    double ph0;              // phase increment for the factors exp(i*phi)

    PRX( i_print("\n\n mass_dig_cnvl(): ai=\n",ai,MIN(aprec,10)); )
    PRX( i_print("\n\n mass_dig_cnvl(): bi=\n",bi,MIN(bprec,10)); )

    ldnx=ld(aprec);
    assert(aprec==(((long)1)<<ldnx));
    assert(aprec==bprec);

    ldn=ldnx+1;     // factor 2 for zero padding
    n=(((long)1)<<ldn);

    PRX( cout<<"\n\n mass_dig_cnvl(): ldn="<<ldn; )

    // --- workspace:
    wramsz=hfg_wsdbls/2;  // re+im

    if(wramsz>=n)  // should be avoided anyway
    {
        PRX( cout<<"\n wramsz>=n ! "; )
        wramsz=n;
    }
    PRX( cout<<"\n  wramsz="<<wramsz; )

    wramr=(double *)gws.get_ws1();
    wrami=(double *)gws.get_ws2();

    // debug: test with more passes:
    //wramsz/=2;

    v=n/wramsz;  // ratio  (total data)/(size of workspace)

    // --- n1,n2:
    ldn1=(ldn>>1);
    ldn2=ldn-ldn1;

#if defined USE_MAX_ROW_LEN
    int ldmaxrowlen=ld(wramsz)-1; // fails if set to ld(wramsz), makes no sense anyway

    PRX( cout<<"\n ldn2="<<ldn2<<"  ldmaxrowlen="<<ldmaxrowlen; )

    while(ldn2<ldmaxrowlen) // one complete row must fit into workspace
    {
        PRX( cout<<" - "; )
        ldn1--;
        ldn2++;
    }
#endif  // defined USE_MAX_ROW_LEN

    // shift a long (as done for n above), not an int:
    n1=(((long)1)<<ldn1);
    n2=(((long)1)<<ldn2);  // n2>=n1

    PRX( cout<<"\n ldn="<<ldn<<"  ldn1="<<ldn1<<"  ldn2="<<ldn2; )
    PRX( cout<<"\n n="<<n<<"  n1="<<n1<<"  n2="<<n2; )

    // --- data on disk:
    // NOTE(review): the files are opened in text mode ("w+"); if the disk copy
    // routines write raw doubles then "w+b" is needed on platforms that
    // distinguish text and binary streams -- confirm against dk_copy*/kd_copy*.
    rfile=fopen(hfg_file0,"w+");
    assert(rfile);
    ifile=fopen(hfg_file1,"w+");
    assert(ifile);

    n2w=n2/v;     // how many cols fit into workspace
    n1w=n1/v;     // how many rows fit into workspace
    PRX( cout<<"\n n1w="<<n1w<<"  n2w="<<n2w; )

    assert(wramsz==n1w*n2);
    assert(wramsz==n2w*n1);

    // --- buffer: (workspace for the transformations over columns)
    bufsz=n1;
    bufr=new double[bufsz];
    assert(bufr);
    bufi=new double[bufsz];
    assert(bufi);

    PRX( cout<<"\n 'csz'=n="<<n<<"  wramsz="<<wramsz<<"  bufsz="<<bufsz; )

    assert(bufsz<=wramsz);
    assert(wramsz<=n);
    assert(wramsz>=n2);

    PRX( cout<<"\n we will loop "<<v<<" times thru workspace: \n"; )

//----------- PART 1: mass storage fourier ---------------------------
    PRV("\n\n ------ start part 1+2:");

    ph0=is*2.0*M_PI/n;
    for(wct=0; wct<v; ++wct) // ---------- fft and exp cols
    {
        // fill workspace from LIMBS  (get n2w cols from data):
        // must fail for n1==1 !
        // condition is <n1/2 for zero padded data (else <n1):
        for(pa=ai+wct*n2w,pb=bi+wct*n2w,pwr=wramr,pwi=wrami,k1=0;
            k1<n1/2;
            ++k1,pa+=n2,pb+=n2,pwr+=n2w,pwi+=n2w)
        {
            id_copy(pa,pwr,n2w);   // a --> real part
            id_copy(pb,pwi,n2w);   // b --> imaginary part
        }

        // fill workspace: zero pad
        // (the lower n1/2 rows of the n1 x n2w block in the workspace)
        d_null(wramr+wramsz/2,wramsz/2);
        d_null(wrami+wramsz/2,wramsz/2);

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather the column (stride n2w) into the contiguous buffer:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            FFT0(bufr,bufi,ldn1,is);

            // multiply by exp(i*phi) while scattering back into the workspace:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                double c,s,phi;
                phi=ph0*ROW*COL;
                sincos(&c,&s,phi);
                cmult6(bufr[k1],bufi[k1],c,s,*pwr,*pwi);
            }
        }

        // write the n2w cols to disk, one piece of length n2w per row:
        for(pf=wct*n2w,pwr=wramr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            dk_copy1(rfile,pwr,pf,n2w);
        }

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // ---------- fft rows:
    for(pf=0,wct=0; wct<v; ++wct,pf+=wramsz)
    {
        // get n1w complete rows from disk:
        kd_copy1(rfile,pf,wramr,wramsz);
        kd_copy2(ifile,pf,wrami,wramsz);

        // loop thru rows:
        for(pwr=wramr,pwi=wrami,k1=0; k1<n1w; ++k1,pwr+=n2,pwi+=n2)
        {
            assert( (pwr-wramr)<wramsz );
            FFT(pwr,pwi,ldn2,is);
            complex_sqr(pwr,pwi,n2); // for convolution

            // Shortcut (saves one pass thru data):
            // Without it the rows would be written to disk here and read
            // again by a separate loop that starts PART 2 (mass storage
            // fourier back).  If the shortcut is not made (and the data not
            // sqrd) the files would hold the fourier transformed (zero
            // padded) data in a _transposed_ form at that point.
            // Instead, the backward transform of the row is done right away:

            FFT(pwr,pwi,ldn2,-is);
        }

        dk_copy1(rfile,wramr,pf,wramsz);   // DISK write
        dk_copy2(ifile,wrami,pf,wramsz);
    }

    ph0=-is*2.0*M_PI/n;
    for(wct=0; wct<v; ++wct) // ---------- exp and fft 'cols'
    {
        // fill workspace from DISK (get n2w cols from data):
        for(pf=wct*n2w,pwr=wramr,k1=0; k1<n1; ++k1,pf+=n2,pwr+=n2w)
        {
            kd_copy1(rfile,pf,pwr,n2w);
        }

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            kd_copy2(ifile,pf,pwi,n2w);
        }

        // loop thru cols in workspace:
        for(k2=0; k2<n2w; ++k2)
        {
            // gather the column (stride n2w) into the contiguous buffer:
            for(pwr=wramr+k2,pwi=wrami+k2,k1=0; k1<n1; ++k1,pwr+=n2w,pwi+=n2w)
            {
                bufr[k1]=*pwr;
                bufi[k1]=*pwi;
            }

            // multiply by exp(i*phi), now _before_ the transform:
            for(k1=0; k1<n1; ++k1)
            {
                double c,s,phi;
                phi=ph0*ROW2*COL2;
                sincos(&c,&s,phi);
                cmult(c,s,bufr[k1],bufi[k1]);
            }

            FFT(bufr,bufi,ldn1,-is);

            // only the imaginary part is needed from here on:
            for(pwi=wrami+k2,k1=0; k1<n1; ++k1,pwi+=n2w)
            {
                *pwi=bufi[k1];
            }
        }

        // normalize by 1/(2*n):
        d_mul(wrami,wramsz,1.0/(n+n));

        for(pf=wct*n2w,pwi=wrami,k1=0; k1<n1; ++k1,pf+=n2,pwi+=n2w)
        {
            dk_copy2(ifile,pwi,pf,n2w);
        }
    }

    // the column buffers are owned by this function, release them:
    delete [] bufr;
    delete [] bufi;

    // the convolved data is now in ifile (in normal order):
    gws.let_ws1();
    gws.let_ws2();

    fclose(rfile);

    return ifile;
}
/*===================== end MASS_DIGIT_CONVOLUTION =========================*/
