#define CHIP_6416
#include <dsk6416.h>
#include <string.h> /* memset() */
#include <stdio.h>
#include <time.h> /* for clock() */
#include "wave_edge.h"

#pragma DATA_SECTION(out_img, "SDRAM");
#pragma DATA_SECTION(in_img, "SDRAM");

/*
 * Builds the detail image from two approximation images and reports
 * its average magnitude.
 *
 * For every interior pixel (rows 2..X_SIZE-3, columns 2..Y_SIZE-3)
 *     p_detail[i] = |p_approx2[i] - p_approx1[i]| >> 9
 * which undoes the 9-bit fixed-point scaling and brings the value back
 * to 8 bpp range.  The 2-pixel frame of p_detail is not written.
 *
 * p_approx1, p_approx2: source images, row stride Y_SIZE.  They are
 *   read two pixels at a time through _amemd8, so each buffer has to
 *   start on an 8-byte boundary.
 * p_detail: destination image, same layout.
 *
 * Returns the sum of all stored magnitudes shifted right by
 * LOG2_N_PIXELS (presumably log2 of the full pixel count, so the
 * skipped border contributes zero to the mean -- confirm in
 * wave_edge.h).
 */
unsigned int calc_detail(int *p_approx1, int *p_approx2, int *p_detail)
{
  /* one running total per pixel of the pair */
  unsigned int total_even = 0, total_odd = 0;
  int row, col;

  for (row = 2; row < (X_SIZE-2); ++row) {
    const int row_start = row * (Y_SIZE);

    for (col = 2; col < (Y_SIZE-2); col += 2) {
      const int at = row_start + col;
      int *src1 = p_approx1 + at;
      int *src2 = p_approx2 + at;
      int *dst  = p_detail + at;
      int mag;

      /* first pixel of the pair: |src2[0]-src1[0]| >> 9 */
      mag = _sshvr(_abs(_lo(_amemd8(src2))-_lo(_amemd8(src1))),9);
      dst[0] = mag;
      total_even += mag;

      /* second pixel of the pair: |src2[1]-src1[1]| >> 9 */
      mag = _sshvr(_abs(_hi(_amemd8(src2))-_hi(_amemd8(src1))),9);
      dst[1] = mag;
      total_odd += mag;
    }
  }

  return (total_even+total_odd)>>LOG2_N_PIXELS;
}

int wave_decomp_a_trous(int nlevels)
{
  int ilevel, irow, jcol, kk,
      cpi; /* "cp" = center pixel index */
  int *p_approx1 = approx1,
      *p_approx2 = approx2,
      *p;

  for (ilevel=0; ilevel<nlevels; ++ilevel) {
    /* 
     * zero out the margins of the current 
     * approximation coefficient matrix -
     * 1st and last couple of rows
     */
    p = p_approx2;
    for (kk = 0; kk < Y_SIZE; kk+=2)
      _amemd8(&p[kk]) = 
      _amemd8(&p[kk+Y_SIZE]) = 
      _amemd8(&p[kk+N_PIXELS-(Y_SIZE<<1)]) = 
      _amemd8(&p[kk+N_PIXELS-Y_SIZE]) = 0;
    
    /* 
     * first and last couple of columns (just
     * the next 2 rows, the remainder are done
     * in the main convolution loop)
     */
    p += (Y_SIZE<<1);
    _amemd8(p) = 0;
    _amemd8(p+Y_SIZE-2) = 0;
    p += Y_SIZE;
    _amemd8(p) = 0;
    _amemd8(p+Y_SIZE-2) = 0;
    p += Y_SIZE;    
    
    /* 5x5 convolution for approximation coeffs at level i */
    for (irow=2; irow<(X_SIZE-2); ++irow) {
    
      /* clearing out 1st & last two cols */
      _amemd8(p) = 0;
      _amemd8(p+Y_SIZE-2) = 0;
      p += Y_SIZE;    
      
      for (jcol=2; jcol<(Y_SIZE-2); ++jcol) {

         cpi = (irow<<LOG2_Y_SIZE) + jcol;
                          /* row 1: 1/256 1/64 3/128 1/64 1/256 */
         p_approx2[cpi] = (p_approx1[cpi-(Y_SIZE<<1)-2]>>8) +
                          (p_approx1[cpi-(Y_SIZE<<1)-1]>>6) +
                          ((3*p_approx1[cpi-(Y_SIZE<<1)])>>7) +
                          (p_approx1[cpi-(Y_SIZE<<1)+1]>>6) +
                          (p_approx1[cpi-(Y_SIZE<<1)+2]>>8) +
                          /* row 2: 1/64 1/16 3/32 1/16 1/64 */
                          (p_approx1[cpi-Y_SIZE-2]>>6) +
                          (p_approx1[cpi-Y_SIZE-1]>>4) +
                          ((3*p_approx1[cpi-Y_SIZE])>>5) +
                          (p_approx1[cpi-Y_SIZE+1]>>4) +
                          (p_approx1[cpi-Y_SIZE+2]>>6) +
                          /* row 3: 3/128 3/32 9/64 3/32 3/128 */
                          ((3*p_approx1[cpi-2])>>7) +
                          ((3*p_approx1[cpi-1])>>5) +
                          ((9*p_approx1[cpi])>>6) +
                          ((3*p_approx1[cpi+1])>>5) +
                          ((3*p_approx1[cpi+2])>>7) +
                          /* row 4: 1/64 1/16 3/32 1/16 1/64 */
                          (p_approx1[cpi+Y_SIZE-2]>>6) +
                          (p_approx1[cpi+Y_SIZE-1]>>4) +
                          ((3*p_approx1[cpi+Y_SIZE])>>5) +
                          (p_approx1[cpi+Y_SIZE+1]>>4) +
                          (p_approx1[cpi+Y_SIZE+2]>>6) +
                          /* row 5: 1/256 1/64 3/128 1/64 1/256 */ 
                          (p_approx1[cpi+(Y_SIZE<<1)-2]>>8) +
                          (p_approx1[cpi+(Y_SIZE<<1)-1]>>6) +
                          ((3*p_approx1[cpi+(Y_SIZE<<1)])>>7) +
                          (p_approx1[cpi+(Y_SIZE<<1)+1]>>6) +
                          (p_approx1[cpi+(Y_SIZE<<1)+2]>>8); 
                                                                            
      }
    }
    
    /* prep for next iteration, 1st swap pointers */
    p = p_approx2;
    p_approx2 = p_approx1;
    p_approx1 = p;
    
  } /* end (for each decomposition level) */
  
  return calc_detail(p_approx1, p_approx2, detail); 
}

/*
 * Thresholds the global detail image into the binary edge map out_img.
 *
 * A pixel is written as 255 (edge) when its detail coefficient is
 * greater than the mean of its 3x3 neighbourhood AND greater than the
 * global mean; otherwise it is written as 0.
 *
 * mean: global mean of the detail image.
 *
 * Only rows 3..X_SIZE-4 and columns 3..Y_SIZE-4 of out_img are written;
 * the 3-pixel frame is left as it was.
 */
void segment_detail_image(int mean)
{
  /*
   * 1/9 in Q.15, rounded to nearest (2^15/9 = 3640.89).  With 3641,
   * (sum*M)>>15 equals floor(sum/9) for every sum below 2^15, which
   * holds for nine coefficients in 8 bpp range.  The truncated value
   * 3640 came out one too low whenever sum was a positive multiple of
   * 9, so a completely flat neighbourhood passed the "greater than
   * local mean" test.
   */
  const int M = 3641;
  int cp;  /* cp = center pixel */
  int sum; /* sum of the 3x3 neighbourhood */
  int avg; /* local mean; kept signed so cp>avg compares int with int */
  int irow, jcol;
  /* pointers to 3 scan-lines for 3x3 filtering */
  int *pd1 = detail+(2*Y_SIZE+2), /* row above the center pixel */
      *pd2 = detail+(3*Y_SIZE+2), /* row of the center pixel */
      *pd3 = detail+(4*Y_SIZE+2); /* row below the center pixel */
  unsigned char *pout = out_img+(3*Y_SIZE+3); /* output img */

  /* pout skips the 3 right + 3 left margin pixels at each row end */
  for (irow=3; irow<(X_SIZE-3); ++irow, pout+=6) {
    for (jcol=3; jcol<(Y_SIZE-3); ++jcol, ++pd1, ++pd2, ++pd3) {
      cp = pd2[1];
      /* find local average using 3x3 averaging filter */
      sum = pd1[0] + pd1[1] + pd1[2] +
            pd2[0] + cp     + pd2[2] +
            pd3[0] + pd3[1] + pd3[2];
      avg = (sum*M) >> 15;
      *pout++ = (cp>avg && cp>mean) ? 255 : 0;
    }
    /* move pointers over to start of the next row */
    pd1 += 6; pd2 += 6; pd3 += 6;
  }
}

/*
 * Entry point: scales the input image into fixed point, runs the
 * wavelet decomposition and edge segmentation, and prints how many
 * clock() ticks the two processing calls took.
 *
 * Returns 0.
 */
int main(void)
{
  const int nlevels = 2;
  int ii;
  int mean;
  clock_t start, stop, overhead; /* timing */

  DSK6416_init(); /* initialize the DSK board support library */

  /* measure the cost of calling clock() itself so it can be removed */
  start = clock();
  stop = clock();
  overhead = stop - start;

  /* optimizer hints: both buffers start on an 8-byte boundary */
  _nassert((int)approx1%8 == 0);
  _nassert((int)in_img%8 == 0);
  for (ii = 0; ii < N_PIXELS; ++ii) {
    /* scale input for fixed-point arithmetic (9 fractional bits) */
    approx1[ii] = (in_img[ii] << 9);
  }

  start = clock(); /* begin "profile area" */
  mean = wave_decomp_a_trous(nlevels);
  segment_detail_image(mean);
  stop = clock(); /* end "profile area" */

  /* clock_t is not guaranteed to be int: convert and print as long */
  printf("# cycles for %d-level wavelet edge detection: %ld\n",
         nlevels, (long)(stop-start-overhead));
  return 0;
}
