#define CHIP_6416
#include <dsk6416.h>
#include <stdio.h> /* printf() */
#include <string.h> /* memset() */
#include <csl_timer.h>
#include <csl_dat.h> /* DMA */
#include "IMG_wave_horz.h"
#include "IMG_wave_vert.h"

#include "image.h" /* image\kernel dimensions, example pixel data */
/*
 * wcoefs is not defined in this file. transform_rows() reads the image rows
 * from it and transform_cols() writes the finished wavelet coefficients back
 * into it, using Y_SIZE shorts as the row pitch.
 * NOTE(review): wcoefs is presumably defined in image.h, which is included
 * just above these pragmas -- confirm the pragmas still take effect when they
 * appear after the definition.
 */
#pragma DATA_ALIGN (wcoefs, 8);
#pragma DATA_SECTION (wcoefs, "SDRAM");

/*
 * D4 wavelet filter coefficients.
 * Each array holds four non-zero taps followed by four zeros (8 entries) and
 * is passed unchanged to IMG_wave_horz() and IMG_wave_vert() as their two
 * filter arguments. The Q15 suffix suggests the taps are scaled by 32768
 * (e.g. 27411/32768 ~= 0.8365) -- presumably qmf is the low-pass filter and
 * mqmf its mirrored high-pass counterpart; verify against the IMGLIB docs.
 */
#pragma DATA_ALIGN (d4_qmf_Q15, 8);
#pragma DATA_ALIGN (d4_mqmf_Q15, 8);
short d4_qmf_Q15[]  = {-4240,7345,27411,15826,0,0,0,0},
	  d4_mqmf_Q15[] = {-15826,27411,-7345,-4240,0,0,0,0};

/*
 * Scratch buffer that fetch_data() fills: a block of image rows in
 * transform_rows(), or eight rows of horzcoefs in transform_cols().
 * Capacity is 8 rows of Y_SIZE shorts.
 */
#pragma DATA_ALIGN (wvlt_in_buf, 8);
short wvlt_in_buf[Y_SIZE*8]; /* scratch (input) buffer */

/*
 * Scratch buffer the IMGLIB kernels write into; its contents are then paged
 * out by page_out_contiguous_block(). Same capacity as wvlt_in_buf.
 */
#pragma DATA_ALIGN (wvlt_out_buf, 8);
short wvlt_out_buf[Y_SIZE*8]; /* scratch (output) buffer */

/*
 * Eight row pointers into wvlt_in_buf, set up by transform_cols() and
 * rotated two at a time by fetch_horz_wavelet_scanlines().
 */
short *pwvbufs[8]; /* IMG_wave_vert() input */

/*
 * Intermediate result of the horizontal pass: transform_rows() writes it,
 * transform_cols() reads it, with rows packed nCols shorts apart.
 */
#pragma DATA_ALIGN (horzcoefs, 8);
#pragma DATA_SECTION (horzcoefs, "SDRAM"); 
short horzcoefs[N_PIXELS]; /* IMG_wave_horz() output */

/*
 * Copy nRows lines of nCols shorts from pSrc into pDst and block until the
 * transfer has finished.
 *
 * Source lines start Y_SIZE shorts apart (the pitch handed to DAT_copy2d),
 * so the source may be a sub-rectangle of a full-width image. The copy type
 * is DAT_2D1D, and callers treat pDst as one packed run of nRows*nCols shorts.
 */
inline
void fetch_data(short *pSrc, int nRows, int nCols, short *pDst)
{
  const unsigned int lineBytes  = nCols * sizeof(short);  /* payload of one line     */
  const unsigned int pitchBytes = Y_SIZE * sizeof(short); /* stride between lines    */

  DAT_wait(DAT_copy2d(DAT_2D1D, pSrc, pDst, lineBytes, nRows, pitchBytes));
}

/*
 * Start an asynchronous copy of nElems contiguous shorts from pSrc to pDst.
 *
 * The copy is submitted as a single line of a DAT_2D1D transfer. The function
 * does not wait: it returns the transfer id, which the caller must hand to
 * DAT_wait() before reusing the source buffer or reading the destination.
 */
inline
Uint32 page_out_contiguous_block(short *pSrc, int nElems, short *pDst)
{
  const unsigned int nBytes = nElems * sizeof(short); /* line length == pitch */

  return DAT_copy2d(DAT_2D1D, pSrc, pDst, nBytes, 1, nBytes);
}

/*
 * Horizontal (row-wise) wavelet pass for one decomposition level.
 *
 * Rows are paged from wcoefs into wvlt_in_buf one block at a time, every row
 * of the block is run through IMG_wave_horz() with the D4 filters, and the
 * block of results is paged out to horzcoefs, packed nCols shorts per row.
 *
 *   level  decomposition level (0 = full-size image); sets the block geometry
 *   nCols  number of columns transformed in each row
 *
 * A block is 8<<level rows. With nCols == Y_SIZE>>level (what dwt2d() passes)
 * one block is Y_SIZE*8 shorts, i.e. exactly the size of the scratch buffers.
 * X_SIZE is assumed to be a multiple of the block height -- TODO confirm.
 *
 * NOTE(review): nBlocks*nBlockRows covers X_SIZE rows at every level, while
 * transform_cols() only consumes X_SIZE>>level rows of horzcoefs, so levels
 * above 0 transform more rows than are used afterwards.
 */
void transform_rows(int level, int nCols)
{
  const int nBlockRows = 8<<level;
  const int nBlocks = X_SIZE>>(3+level);
  int iRow = 0, iBlock, kk;
  short *pin, *pout;
  Uint32 id_EDMAout = DAT_XFRID_WAITNONE;

  /*
   * Exactly nBlocks passes. The former do/while with "++iBlock <= nBlocks"
   * ran one pass too many, fetching rows past the end of the image and
   * paging a block out past the end of horzcoefs.
   */
  for (iBlock = 0; iBlock < nBlocks; ++iBlock) {
    /* get next block of rows (blocks until the data has arrived) */
    fetch_data(&wcoefs[iRow*Y_SIZE], nBlockRows, nCols, wvlt_in_buf);

    /* the previous page-out still reads wvlt_out_buf: let it finish
       before the kernel below overwrites that buffer */
    DAT_wait(id_EDMAout);

    pin = wvlt_in_buf;
    pout = wvlt_out_buf;
    for (kk = 0; kk < nBlockRows; ++kk, pin += nCols, pout += nCols)
      IMG_wave_horz(pin, d4_qmf_Q15, d4_mqmf_Q15, pout, nCols);

    /* write horizontal wavelet coeffs out to ext mem storage; the transfer
       runs in the background while the next block is being fetched */
    id_EDMAout = page_out_contiguous_block(wvlt_out_buf, nBlockRows*nCols,
                                           horzcoefs + iRow*nCols);
    iRow += nBlockRows;
  }

  /* the caller reads horzcoefs next: make sure the last block has landed */
  DAT_wait(id_EDMAout);
}

/*
 * Advance the 8-line window used by IMG_wave_vert() by two scan-lines
 * (see Figure 6-9).
 *
 * The two oldest line pointers (slots 0 and 1) are recycled into slots 6 and
 * 7, every other pointer moves down two slots, and rows r and r+1 of
 * horzcoefs are fetched with a single 2*nCols copy into the buffer at slot 6.
 * That single copy relies on slot 7 pointing nCols shorts past slot 6, which
 * is how transform_cols() lays the pairs out.
 *
 * Returns r+2, the row index to pass on the next call.
 */
inline
int fetch_horz_wavelet_scanlines(int r, int nCols)
{
  short *recycled0 = pwvbufs[0];
  short *recycled1 = pwvbufs[1];
  int slot;

  /* slide slots 2..7 down into 0..5 */
  for (slot = 0; slot < 6; ++slot)
    pwvbufs[slot] = pwvbufs[slot + 2];

  pwvbufs[6] = recycled0;
  pwvbufs[7] = recycled1;

  /* two consecutive rows land in the recycled pair */
  fetch_data(horzcoefs + r*nCols, 1, nCols * 2, pwvbufs[6]);

  return r + 2;
}

/*
 * Vertical (column-wise) wavelet pass for one decomposition level.
 *
 * Reads the X_SIZE>>level rows of horzcoefs (nCols shorts per row) through an
 * 8-line window in wvlt_in_buf, and for every step of two input rows writes
 * one low-pass and one high-pass output row into wcoefs (row pitch Y_SIZE).
 * Low-pass rows go to wcoefs rows [0, nRows/2), high-pass rows to
 * [nRows/2, nRows).
 *
 *   level  decomposition level (0 = full-size image)
 *   nCols  number of columns in each row of horzcoefs at this level
 *
 * The window is primed with the last six rows followed by the first two, so
 * the first output is computed across the wrap-around; the low-pass output
 * row index correspondingly starts at nRows/2-3 and wraps to 0.
 * NOTE(review): nRows/2 is assumed to be at least 3, otherwise lpRow starts
 * negative -- confirm against the supported image sizes.
 */
void transform_cols(int level, int nCols)
{
  const int nRows = X_SIZE>>level, nRowsDiv2 = nRows>>1,
            circular = nRowsDiv2-1;
  int lpRow = nRowsDiv2-3, /* low-pass vert output */
      hpRow = nRowsDiv2,   /* high-pass vert output */
      fetchRow = 2, iRow;
  short *plpvc = wvlt_out_buf, /* ptr to low-pass vert coeffs */
        *phpvc = wvlt_out_buf+nCols; /* high-pass vert coeffs */
  Uint32 id_EDMAout1 = DAT_XFRID_WAITNONE,
         id_EDMAout2 = DAT_XFRID_WAITNONE;

  /* setup scan-lines for DWT down the columns:
     last 6 rows into lines 0..5, first 2 rows into lines 6..7 */
  fetch_data(horzcoefs+(nRows*nCols)-(6*nCols), 1, 6*nCols, wvlt_in_buf);
  fetch_data(horzcoefs, 1, nCols<<1, wvlt_in_buf+6*nCols);
  for (iRow = 0; iRow < 8; ++iRow)
    pwvbufs[iRow] = wvlt_in_buf + iRow*nCols;

  /* now march through the image and DWT the columns */
  for (iRow = 0; iRow < nRowsDiv2; ++iRow) {
    /* the previous page-outs still read plpvc/phpvc: let them finish
       before IMG_wave_vert overwrites those lines */
    DAT_wait(id_EDMAout1);
    DAT_wait(id_EDMAout2);

    IMG_wave_vert(pwvbufs, d4_qmf_Q15, d4_mqmf_Q15, plpvc, phpvc, nCols);

    id_EDMAout1 = page_out_contiguous_block(plpvc, nCols, wcoefs+lpRow*Y_SIZE);
    id_EDMAout2 = page_out_contiguous_block(phpvc, nCols, wcoefs+hpRow*Y_SIZE);

    /* Fetch the next two rows only if another iteration will use them.
       Fetching after the final iteration would read rows nRows and nRows+1
       of horzcoefs, which are not part of this level's data. */
    if (iRow + 1 < nRowsDiv2)
      fetchRow = fetch_horz_wavelet_scanlines(fetchRow, nCols);

    lpRow += 1; if (lpRow>circular) lpRow=0; /* lpRow = (lpRow+1) % nRowsDiv2 */
    hpRow += 1;
  }

  /* wcoefs must be complete, and wvlt_out_buf free, when we return */
  DAT_wait(id_EDMAout1);
  DAT_wait(id_EDMAout2);
}

/*
 * Multi-level 2D wavelet decomposition.
 *
 * Runs N levels; each level is a row pass (transform_rows) followed by a
 * column pass (transform_cols). The column count starts at Y_SIZE and is
 * halved after every level.
 */
void dwt2d(int N)
{
  int level = 0;
  int nCols = Y_SIZE;

  while (level < N) {
    transform_rows(level, nCols);
    transform_cols(level, nCols);

    ++level;
    nCols >>= 1; /* next level works on half as many columns */
  }
}

/*
 * Profiling harness: initializes the board, a timer and the DAT (EDMA)
 * module, then times N runs of a 2-level dwt2d() and prints the per-run and
 * average figures.
 *
 * The profiled region is bracketed with TIMER_getCount() on the same timer
 * that was used to measure the call overhead, so start, stop and overhead
 * are all in the same units.
 * NOTE(review): the factor of 8 presumably converts timer ticks to CPU
 * cycles (timer clocked at CPU/8) -- confirm for this device.
 */
int main(void)
{
  TIMER_Handle hTimer;
  unsigned int start, stop, overhead, total = 0, t; /* timing */
  const int N = 1; /* how many times to profile */
  int ii = 0;

  DSK6416_init(); /* initialize the DSK board support library */

  /* configure timer */
  hTimer = TIMER_open(TIMER_DEVANY,0);
  TIMER_configArgs(hTimer, 0x000002C0, 0xFFFFFFFF, 0x00000000);

  /* initialize EDMA */
  DAT_open(DAT_CHAANY, DAT_PRI_HIGH, DAT_OPEN_2D);

  /* compute overhead of calling the timer. */
  start    = TIMER_getCount(hTimer);  /* called twice to avoid L1D miss.  */
  start    = TIMER_getCount(hTimer);
  stop     = TIMER_getCount(hTimer);
  overhead = stop - start;

  for (; ii<N; ++ii) {
    start = TIMER_getCount(hTimer); /* begin "profile area" */
    dwt2d(2);
    stop = TIMER_getCount(hTimer); /* end "profile area" */
    t = (stop-start-overhead) * 8; /* unsigned arithmetic, so a counter wrap is harmless */
    total += t;
    printf("# cycles to filter image: %u\n", t);
  }

  printf("avg time is %.2f cycles.\n", (float)total/(float)N);
  return 0;
}
