#define CHIP_6416
#include <dsk6416.h>
#include <stdio.h>
#include <time.h> /* for clock() */

#include "image.h" /* image\kernel dimensions, example pixel data */
/*
 * Both coefficient buffers are requested with 8-byte alignment and are
 * placed in the section named "SDRAM".
 * NOTE(review): wcoefs is not defined in this file; presumably it comes
 * from image.h -- confirm.
 */
#pragma DATA_ALIGN (wcoefs, 8);
#pragma DATA_SECTION (wcoefs, "SDRAM");

#pragma DATA_ALIGN (horzcoefs, 8);
#pragma DATA_SECTION (horzcoefs, "SDRAM"); 
/*
 * Intermediate image: dwt2d() fills it with wave_horz() output and
 * idwt2d() fills it with invwave_vert() output.
 */
short horzcoefs[N_PIXELS]; /* wave_horz() output */

/* rounding term added to a Q15 accumulator before the >>15 in wave_horz() */
#define Qr 16384 /* Q15 round (0.5) */

/* in this file only used for the MUST_ITERATE lower bound in wave_horz() */
#define MAX_LEVELS 6 /* min(X_SIZE_LOG_2,Y_SIZE_LOG_2)-2 */

/*
 * start/stop bracket the profiled loop in wave_horz(); overhead is the
 * cost of two back-to-back clock() calls, measured once in main().
 */
clock_t start, stop, overhead; /* timing */

/*
 * Forward DWT along the horizontal (row) direction.
 *
 * Filters one row with the 4-tap low-pass and high-pass decomposition
 * filters, keeping every second result.  The cols/2 low-pass outputs are
 * written to out_data first, followed by the cols/2 high-pass outputs.
 * The row is treated as periodic: the first two outputs of each band use
 * samples that wrap around the end of the row.
 *
 * Side effects: the interior low-pass loop is timed with clock() (results
 * left in the globals start/stop) and the cycle count, less the global
 * overhead, is printed on every call.
 *
 * NOTE(review): the interior loops add Qr before the >>15 while the two
 * wrap-around outputs of each band truncate -- confirm this is intended.
 */

/*
 * 0 = time the plain C low-pass loop (the configuration this file ships
 * with), 1 = time the variant written with C64x intrinsics.
 */
#ifndef WAVE_HORZ_USE_INTRINSICS
#define WAVE_HORZ_USE_INTRINSICS 0
#endif

void wave_horz
(
  short *in_data,  /* row of input pixels                     */
  const short *hLP, /* low-pass 4-tap D4 decomposition filter */
  const short *hHP, /* high-pass 4-tap decomposition filter   */
  short *out_data, /* row of output data                      */
  int    cols      /* length of input                         */
)
{
  int         ii, iters = (cols>>1) - 2; /* interior outputs per band */
  const short *xptr;
  int         sum;
  long        cycles;
#if WAVE_HORZ_USE_INTRINSICS
  /* all four low-pass taps fetched once as a single 64-bit load */
  double x0_x1_x2_x3,
         hLP0_hLP1_hLP2_hLP3 = _memd8_const(hLP);
  int hLP0_hLP1 = _lo(hLP0_hLP1_hLP2_hLP3),
      hLP2_hLP3 = _hi(hLP0_hLP1_hLP2_hLP3),
      x0_x1, x2_x3;
#endif

  /* ------------------------------------------------- */
  /*  Convolve input with 4 tap low-pass filter,       */
  /*  followed by a downsampling by 2x. Periodize the  */
  /*  signal by moving the last three pixels to the    */
  /*  beginning of the sequence.                       */
  /* ------------------------------------------------- */

  /* periodization: 1st two times through the loop */
  *out_data++ = (hLP[0]*in_data[cols-3] + hLP[1]*in_data[cols-2] +
                 hLP[2]*in_data[cols-1] + hLP[3]*in_data[0]) >> 15;
  *out_data++ = (hLP[0]*in_data[cols-1] + hLP[1]*in_data[0] +
                 hLP[2]*in_data[1]      + hLP[3]*in_data[2]) >> 15;

  /* rest of the low-pass filter + decimation loop */
  xptr = in_data + 1;
  start = clock(); /* begin "profile area" */
#if WAVE_HORZ_USE_INTRINSICS
  for (ii=0; ii<iters; ++ii, xptr+=2) {
    /* four samples in one load, then two packed 16-bit dot products */
    x0_x1_x2_x3 = _memd8_const(xptr);
    x0_x1 = _lo(x0_x1_x2_x3);
    x2_x3 = _hi(x0_x1_x2_x3);
    sum = Qr + _dotp2(x0_x1, hLP0_hLP1) + _dotp2(x2_x3, hLP2_hLP3);
    *out_data++ = sum>>15;
  }
#else
  #pragma MUST_ITERATE(1<<(Y_SIZE_LOG_2-MAX_LEVELS), Y_SIZE, 2)
  for (ii=0; ii<iters; ++ii, xptr+=2) {
    sum = Qr +
          xptr[0]*hLP[0] +
          xptr[1]*hLP[1] +
          xptr[2]*hLP[2] +
          xptr[3]*hLP[3];

    *out_data++ = sum>>15;
  }
#endif
  stop = clock(); /* end "profile area" */

  /*
   * clock_t is not guaranteed to be int, so convert explicitly and print
   * with a matching conversion.  The overhead is subtracted in signed
   * arithmetic so a very short run cannot wrap to a huge unsigned value.
   */
  cycles = (long)(stop - start) - (long)overhead;
  printf("# cycles = %ld\n", cycles);

  /* ------------------------------------------------- */
  /*  Same as above but this time convolve with the    */
  /*  high-pass filter.                                */
  /* ------------------------------------------------- */

  *out_data++ = (hHP[0]*in_data[cols-3] + hHP[1]*in_data[cols-2] +
                 hHP[2]*in_data[cols-1] + hHP[3]*in_data[0]) >> 15;
  *out_data++ = (hHP[0]*in_data[cols-1] + hHP[1]*in_data[0] +
                 hHP[2]*in_data[1]      + hHP[3]*in_data[2]) >> 15;

  xptr = in_data + 1;
  for (ii=0; ii<iters; ++ii, xptr+=2) {
    sum  = Qr +
           xptr[0]*hHP[0] +
           xptr[1]*hHP[1] +
           xptr[2]*hHP[2] +
           xptr[3]*hHP[3];

    *out_data++ =  sum>>15;
  }
}

/*
 * Forward DWT along the vertical (column) direction.
 *
 * in_data holds pointers to four input rows.  For each of the first cols
 * columns, the four vertically adjacent samples are weighted by the
 * low-pass taps (result to out_approx) and by the high-pass taps (result
 * to out_detail).  Each sum is shifted right by 15 with no rounding term.
 *
 * Choosing which four rows to pass, including the wrap-around cases, is
 * the caller's job.
 */
void wave_vert
(
  const short *const *in_data, /* array of row pointers                   */
  const short *gLP,            /* low-pass 4-tap D4 decomposition filter  */
  const short *gHP,            /* high-pass 4-tap D4 decomposition filter */
  short       *out_approx,     /* approximation coefficients              */
  short       *out_detail,     /* detail coefficients                     */
  int          cols            /* length of rows to process               */
)
{
  int col = 0;

  while (col < cols) {
    int acc;

    /* low-pass: the four samples of this column, top row first */
    acc  = gLP[0] * in_data[0][col];
    acc += gLP[1] * in_data[1][col];
    acc += gLP[2] * in_data[2][col];
    acc += gLP[3] * in_data[3][col];
    out_approx[col] = acc >> 15;

    /* high-pass over the same four samples */
    acc  = gHP[0] * in_data[0][col];
    acc += gHP[1] * in_data[1][col];
    acc += gHP[2] * in_data[2][col];
    acc += gHP[3] * in_data[3][col];
    out_detail[col] = acc >> 15;

    ++col;
  }
}

/*
 * Inverse DWT along the horizontal (row) direction.
 *
 * in_data holds cols/2 approximation coefficients followed by cols/2
 * detail coefficients; out_data receives cols reconstructed samples.
 * Each pass produces one even and one odd output sample: the even one
 * uses taps 3 and 1 of both filters, the odd one uses taps 2 and 0.
 * This is what upsampling both bands by two (inserting zeros), filtering
 * each with its reconstruction filter and adding the results reduces to.
 * The last four outputs wrap around to the first coefficients of each
 * band.  Sums are shifted right by 15 with no rounding term.
 */
void invwave_horz(
  short *in_data,   /* row of input pixels                    */
  const short *gLP, /* low-pass 4-tap reconstruction filter   */
  const short *gHP, /* high-pass 4-tap reconstruction filter  */
  short *out_data,  /* row of output data                     */
  int    cols       /* length of input                        */
)
{
  short *approx = in_data;               /* start of approx coefs */
  short *detail = in_data + (cols>>1);   /* start of detail coefs */
  short *dst    = out_data;
  int    even, odd;
  int    k;

  /* interior: two output samples per coefficient index k */
  for (k = 0; 2*k < cols-4; ++k) {
    even = gLP[3]*approx[k]   + gLP[1]*approx[k+1] +
           gHP[3]*detail[k]   + gHP[1]*detail[k+1];
    *dst++ = even>>15;

    odd  = gLP[2]*approx[k+1] + gLP[0]*approx[k+2] +
           gHP[2]*detail[k+1] + gHP[0]*detail[k+2];
    *dst++ = odd>>15;
  }

  /*
   * Periodization: k now indexes the second-to-last coefficient of each
   * band; the remaining four outputs wrap around to index 0 and 1.
   */
  even = (gLP[3]*approx[k]   + gLP[1]*approx[k+1]) +
         (gHP[3]*detail[k]   + gHP[1]*detail[k+1]);
  dst[0] = even>>15;

  odd  = (gLP[2]*approx[k+1] + gLP[0]*approx[0]) +
         (gHP[2]*detail[k+1] + gHP[0]*detail[0]);
  dst[1] = odd>>15;

  even = (gLP[3]*approx[k+1] + gLP[1]*approx[0]) +
         (gHP[3]*detail[k+1] + gHP[1]*detail[0]);
  dst[2] = even>>15;

  odd  = (gLP[2]*approx[0]   + gLP[0]*approx[1]) +
         (gHP[2]*detail[0]   + gHP[0]*detail[1]);
  dst[3] = odd>>15;
}

/*
 * Inverse DWT along the vertical (column) direction.
 *
 * in_data holds four row pointers: rows 0 and 1 are weighted with the
 * low-pass reconstruction taps, rows 2 and 3 with the high-pass taps.
 * For each of the first cols columns two output samples are produced:
 * out_data1 uses taps 2 and 0 of both filters, out_data2 uses taps 3 and
 * 1.  Sums are shifted right by 15 with no rounding term.
 *
 * The caller decides which output rows out_data1 and out_data2 refer to;
 * they need not be adjacent (the wrap-around case).
 */
void invwave_vert
(
  const short *const *in_data, /* array of row pointers                 */
  short       *gLP,            /* low-pass 4-tap reconstruction filter  */
  short       *gHP,            /* high-pass 4-tap reconstruction filter */
  short       *out_data1,      /* 1st output row                        */
  short       *out_data2,      /* 2nd output row                        */
  int          cols            /* length of rows to process             */
)
{
  int col;

  for (col = 0; col < cols; col++) {
    int acc;

    /* taps 2 and 0 of both filters */
    acc  = gLP[2] * in_data[0][col];
    acc += gLP[0] * in_data[1][col];
    acc += gHP[2] * in_data[2][col];
    acc += gHP[0] * in_data[3][col];
    out_data1[col] = acc >> 15;

    /* taps 3 and 1 of both filters */
    acc  = gLP[3] * in_data[0][col];
    acc += gLP[1] * in_data[1][col];
    acc += gHP[3] * in_data[2][col];
    acc += gHP[1] * in_data[3][col];
    out_data2[col] = acc >> 15;
  }
}

/*
 * Multi-level 2-D wavelet decomposition of the image in wcoefs, done in
 * place: each level first transforms the rows (wcoefs -> horzcoefs) and
 * then the columns (horzcoefs -> wcoefs).
 *
 * After a level, the top nlr/2 rows of the processed region hold the
 * vertical low-pass output and the next nlr/2 rows the vertical
 * high-pass output.  The following level works on half as many rows and
 * columns.  Rows are always Y_SIZE samples apart in both buffers.
 *
 * nLevels: number of decomposition levels; values < 1 do nothing.
 * NOTE(review): no upper bound is checked here; each level needs at
 * least 4 rows and 4 columns for the wrap-around indexing below.
 */
void dwt2d(int nLevels)
{
  /*
   * Q15 D4 decomposition filters.  The taps are stored in reverse order
   * relative to {15826,27411,7345,-4240} / {-4240,-7345,27411,-15826}.
   */
  const short d4_decomp_lp_Q15[] = {-4240,7345,27411,15826},
              d4_decomp_hp_Q15[] = {-15826,27411,-7345,-4240};

  int nlr = X_SIZE, /* # rows in current level */
      nlc = Y_SIZE, /* # cols in current level */
      ilevel = 0, irow, ii;

  /* for wave_vert */
  /*
   * Pointers to const so the array converts to wave_vert's
   * 'const short * const *' parameter; 'short **' does not in C.
   */
  const short *pwvbufs[4]; /* ptrs to input scan-lines */
  short *plopass, *phipass; /* output ptrs */
  int nrow, scol;

  for (; ilevel<nLevels; ++ilevel, nlr>>=1, nlc>>=1) {

    /* transform the rows */
    for (irow=0; irow<nlr; ++irow)
      wave_horz(wcoefs+irow*Y_SIZE, /* input (row n) */
                d4_decomp_lp_Q15, d4_decomp_hp_Q15,
                horzcoefs+irow*Y_SIZE, /* output */
                nlc);

    /* transform the cols */
    nrow = nlr-3;

    /* periodization: 1st time through, input rows N-3, N-2, N-1 and 0 */
    pwvbufs[0]=horzcoefs+nrow*Y_SIZE; pwvbufs[1]=pwvbufs[0]+Y_SIZE;
    pwvbufs[2]=pwvbufs[1]+Y_SIZE;     pwvbufs[3]=horzcoefs;
    plopass=wcoefs;                   phipass=wcoefs+(nlr>>1)*Y_SIZE;
    wave_vert(pwvbufs,
              d4_decomp_lp_Q15, d4_decomp_hp_Q15,
              plopass, phipass,
              nlc);
    plopass += Y_SIZE; phipass += Y_SIZE;

    /* 2nd time through: last row and 1st three rows */
    pwvbufs[0]=horzcoefs+(nrow+2)*Y_SIZE; pwvbufs[1]=horzcoefs;
    pwvbufs[2]=pwvbufs[1]+Y_SIZE;         pwvbufs[3]=pwvbufs[2]+Y_SIZE;
    wave_vert(pwvbufs,
              d4_decomp_lp_Q15, d4_decomp_hp_Q15,
              plopass, phipass,
              nlc);
    plopass += Y_SIZE; phipass += Y_SIZE;

    /* and the rest of 'em through this loop */
    scol = 1; /* first input row of the 4-row window; advances by 2 */
    for (ii=0; ii<(nlr>>1)-2; ++ii, plopass+=Y_SIZE, phipass+=Y_SIZE) {
      pwvbufs[0] = horzcoefs+(scol*Y_SIZE);
      pwvbufs[1] = pwvbufs[0]+Y_SIZE;
      pwvbufs[2] = pwvbufs[1]+Y_SIZE;
      pwvbufs[3] = pwvbufs[2]+Y_SIZE;
      wave_vert(pwvbufs,
                d4_decomp_lp_Q15, d4_decomp_hp_Q15,
                plopass, phipass,
                nlc);
      scol += 2;
    }

  } /* end (for each wavelet decomposition level) */
}

/*
 * Multi-level 2-D inverse wavelet transform of the coefficients in
 * wcoefs, done in place.  Processing starts with the smallest region
 * (X_SIZE>>(nLevels-1) rows by Y_SIZE>>(nLevels-1) columns) and doubles
 * both dimensions after every level.  Each level first inverts the
 * vertical transform (wcoefs -> horzcoefs) and then the horizontal one
 * (horzcoefs -> wcoefs).
 *
 * nLevels: number of levels to invert; values < 1 do nothing.
 * NOTE(review): each level needs at least 4 rows and 4 columns for the
 * wrap-around indexing below; no upper bound on nLevels is checked.
 */
void idwt2d(int nLevels)
{
  short d4_synth_lp_Q15[]  = {-4240,7345,27411,15826},
        d4_synth_hp_Q15[] = {-15826,27411,-7345,-4240};

  int nlr, /* # rows in current level */
      nlc, /* # cols in current level */
      ilevel, irow, ii;

  /* for invwave_vert */
  /*
   * Pointers to const so the array converts to invwave_vert's
   * 'const short * const *' parameter; 'short **' does not in C.
   * [0],[1] walk the approximation rows, [2],[3] the detail rows.
   */
  const short *pwvbufs[4];
  short *pidwt1, *pidwt2;

  /* a shift count of nLevels-1 would be negative (undefined) below 1 */
  if (nLevels < 1)
    return;

  nlr = X_SIZE>>(nLevels-1);
  nlc = Y_SIZE>>(nLevels-1);

  for (ilevel=0; ilevel<nLevels; ++ilevel, nlr<<=1, nlc<<=1) {

    /*
     * first perform the vertical transform, 2D DWT coefficients
     * are in the wcoefs buffer, send inverse into horzcoefs
     */

    /*
     * periodization: 1st time through, the inputs are the first two
     * approximation rows and the first two detail rows (the detail rows
     * start half-way down); the outputs are the last and the first row.
     */
    pwvbufs[0]=wcoefs;                pwvbufs[2]=wcoefs+(nlr>>1)*Y_SIZE;
    pwvbufs[1]=pwvbufs[0]+Y_SIZE;     pwvbufs[3]=pwvbufs[2]+Y_SIZE;
    pidwt1 = horzcoefs+(nlr-1)*Y_SIZE;          pidwt2 = horzcoefs;
    invwave_vert(pwvbufs,
                 d4_synth_lp_Q15, d4_synth_hp_Q15,
                 pidwt1, pidwt2,
                 nlc);

    /*
     * interior portion of the vertical convolutions with no
     * periodization effects.  It stops one window short of the band
     * ends: that window needs wrap-around and is handled after the
     * loop.  Running it here would read one row past each band (past
     * the coefficient image at the last level) to produce two rows the
     * periodization call overwrites anyway.
     */
    pidwt1=horzcoefs+Y_SIZE; pidwt2=pidwt1+Y_SIZE;
    for (ii=0; ii<nlr-4;
         ii+=2, pidwt1+=(Y_SIZE<<1), pidwt2+=(Y_SIZE<<1)) {

      /* slide both two-row windows down by one row */
      pwvbufs[0]=pwvbufs[1];  pwvbufs[1]+=Y_SIZE;
      pwvbufs[2]=pwvbufs[3];  pwvbufs[3]+=Y_SIZE;

      invwave_vert(pwvbufs,
                   d4_synth_lp_Q15, d4_synth_hp_Q15,
                   pidwt1, pidwt2,
                   nlc);

    }

    /*
     * periodization: last time through, the inputs are the last row
     * and the 1st row of each of the two sections of wcoefs pointed
     * to by pwvbufs; pidwt1/pidwt2 already point at the two output
     * rows that are still missing.
     */
    pwvbufs[0]=pwvbufs[1];       pwvbufs[2]=pwvbufs[3];
    pwvbufs[1]=wcoefs;           pwvbufs[3]=wcoefs+(nlr>>1)*Y_SIZE;
    invwave_vert(pwvbufs,
                 d4_synth_lp_Q15, d4_synth_hp_Q15,
                 pidwt1, pidwt2,
                 nlc);

    /*
     * done with vertical inverse transform,
     * horizontal direction much simpler.
     */
    for (irow=0; irow<nlr; ++irow)
      invwave_horz(horzcoefs+irow*Y_SIZE,
                   d4_synth_lp_Q15, d4_synth_hp_Q15,
                   wcoefs+irow*Y_SIZE,
                   nlc);

  } /* end (for each wavelet scale) */
}

/*
 * Entry point: initializes the board support library, measures the cost
 * of calling clock(), then runs a 3-level forward transform followed by
 * the matching inverse transform on the image buffers.
 */
int main(void)
{
  int levels = 3;
  DSK6416_init(); /* initialize the DSK board support library */

  /*
   * Two back-to-back clock() calls give the overhead of the call itself;
   * wave_horz() subtracts it from its measurements.
   */
  start = clock();
  stop = clock();
  overhead = stop - start;

  dwt2d(levels);
  idwt2d(levels);

  return 0; /* explicit status: falling off the end is undefined in C89 */
}
