#define CHIP_6416
#include <dsk6416.h>
#include <stdio.h> /* printf() */
#include <string.h> /* memset() */
#include <csl_timer.h>
#include <csl_dat.h> /* DMA */

#include "..\image.h" /* image\kernel dimensions, example pixel data */
/*
 * Image buffers. in_img is not declared in this file (presumably it comes
 * from image.h - confirm); out_img receives the filtered result. The pragmas
 * request 8-byte alignment and placement in the linker section named "SDRAM"
 * for both arrays.
 */
#pragma DATA_ALIGN (in_img, 8);
#pragma DATA_SECTION (in_img, "SDRAM");

#pragma DATA_ALIGN (out_img, 8);
#pragma DATA_SECTION (out_img, "SDRAM"); 
unsigned char out_img[N_PIXELS]; /* filtered image */

/* filter dimensions (the median filter below has no coefficients) */
#define NH 5 /* kernel is of size NHxNH (needs to be 5 for this program) */
#define BOUNDARY 2 /* border width in pixels (NH/2): rows/columns this close to an edge are not filtered */

/* 
 * For block processing, segment image into individual chunks
 * that are then paged in and out of internal memory.
 * X_SIZE (number of rows) and Y_SIZE (pixels per row) are not defined in
 * this file; presumably they come from image.h.
 */
#define NUM_SCAN_LINES   16 /* # rows constituting a block, must divide evenly into X_SIZE */
#define NUM_BLOCKS       (X_SIZE/NUM_SCAN_LINES) /* # blocks in partitioned image */
#define BLOCK_X_SIZE     (NUM_SCAN_LINES+2*BOUNDARY) /* rows per input block: NUM_SCAN_LINES plus BOUNDARY extra rows above and below */

/* 
 * Scratch buffers used by filter_image() to stage one block at a time.
 * The original author intends these to live in on-chip RAM; no
 * DATA_SECTION pragma is given here, so the actual placement depends on
 * the linker configuration (not visible in this file).
 *
 * input_buf  = input pixels, BLOCK_X_SIZE rows of Y_SIZE bytes
 * output_buf = filtered pixels, NUM_SCAN_LINES rows of Y_SIZE bytes
 */
#pragma DATA_ALIGN (input_buf, 8);
unsigned char input_buf[BLOCK_X_SIZE*Y_SIZE];
#pragma DATA_ALIGN (output_buf, 8);
unsigned char output_buf[NUM_SCAN_LINES*Y_SIZE];

/*
 * Zero `count` bytes starting at `ptr`, one long-sized store at a time
 * (intended as a faster replacement for memset(ptr, 0, count)).
 *
 * Preconditions (not checked at run time):
 *   - ptr is 8-byte aligned;
 *   - count is a multiple of sizeof(long);
 *   - count covers at least 32 stores (256 bytes when long occupies
 *     8 bytes), because that is the minimum trip count promised to the
 *     TI compiler by the MUST_ITERATE pragma below.
 * Bytes beyond the last whole long are left untouched.
 */
void memclear( void * ptr, int count )
{
  long * lptr = ptr;
#if defined(_TMS320C6X)
  /* TI C6000 optimizer hints; skipped on other compilers */
  _nassert((int)lptr%8==0);
  #pragma MUST_ITERATE (32);
#endif
  /* derive the store count from the real size of long, not a fixed 8 */
  for (count /= (int)sizeof(*lptr); count>0; count--)
    *lptr++ = 0;
}

/* 
 * Filter one block of an image (see filter_block() below); returns the
 * index of the last row filtered, i.e. irow+nrows-1.
 */

/*
 * N holds the current neighborhood of 5x5 pixels (NH*NH bytes, stored
 * row by row). filter_block() refills it for every output pixel and
 * reorders it in place while searching for the median.
 */
#pragma DATA_ALIGN (N, 8);
unsigned char N[NH*NH];

/*
 * Compare-exchange helpers for the partial sorting routine.
 *
 * PIX_SWAP(a,b) exchanges two unsigned char lvalues.
 * PIX_SORT(a,b) leaves the smaller value in a and the larger in b.
 *
 * Both are wrapped in do { } while (0) so that they behave as a single
 * statement (safe in an unbraced if/else). Arguments are evaluated more
 * than once, so they must be free of side effects.
 */
#define PIX_SWAP(a,b) do { unsigned char temp=(a);(a)=(b);(b)=temp; } while (0)
#define PIX_SORT(a,b) do { if ((a)>(b)) PIX_SWAP((a),(b)); } while (0)

/*
 * Median-filter nrows scan-lines with a 5x5 window.
 *
 * irow  - row index of the first scan-line; used only as the origin of the
 *         row counter (it does not affect which memory is accessed).
 * nrows - number of output scan-lines to produce.
 * pin   - top-left pixel of the first 5x5 window. Rows are Y_SIZE bytes
 *         apart; nrows+4 consecutive rows are read starting here.
 * pout  - start of the first output row. In each of the nrows rows the
 *         BOUNDARY left-most and BOUNDARY right-most pixels are skipped
 *         (left unwritten); the pixels in between receive the median.
 *
 * Returns irow+nrows-1, the index of the last row filtered.
 * Works through the file-scope scratch array N.
 */
int filter_block(int irow, int nrows, 
                 const unsigned char *restrict pin, 
                 unsigned char *restrict pout)
{
  int ii=irow, jj;
  for (; ii<irow+nrows; ++ii) {
    pout += BOUNDARY; /* skip the unfiltered left border of this row */
    for (jj=BOUNDARY; jj<Y_SIZE-BOUNDARY; ++jj, pin++, pout++) {
	  // splice out the current neighborhood: five 5-pixel runs, one per
	  // image row, packed back to back into N
	  memcpy(N, pin, 5*sizeof(unsigned char));
	  memcpy(&N[5], pin+Y_SIZE, 5*sizeof(unsigned char));
	  memcpy(&N[10], pin+2*Y_SIZE, 5*sizeof(unsigned char));
	  memcpy(&N[15], pin+3*Y_SIZE, 5*sizeof(unsigned char));
	  memcpy(&N[20], pin+4*Y_SIZE, 5*sizeof(unsigned char));
	  
	  // search for the median value in a list of 25 elements using a fixed
	  // sequence of compare-exchange steps; the order of the steps matters,
	  // and the result is taken from N[12] afterwards
	  PIX_SORT(N[0], N[1]) ;   PIX_SORT(N[3], N[4]) ;   PIX_SORT(N[2], N[4]) ;
	  PIX_SORT(N[2], N[3]) ;   PIX_SORT(N[6], N[7]) ;   PIX_SORT(N[5], N[7]) ;
	  PIX_SORT(N[5], N[6]) ;   PIX_SORT(N[9], N[10]) ;  PIX_SORT(N[8], N[10]) ;
	  PIX_SORT(N[8], N[9]) ;   PIX_SORT(N[12], N[13]) ; PIX_SORT(N[11], N[13]) ;
	  PIX_SORT(N[11], N[12]) ; PIX_SORT(N[15], N[16]) ; PIX_SORT(N[14], N[16]) ;
	  PIX_SORT(N[14], N[15]) ; PIX_SORT(N[18], N[19]) ; PIX_SORT(N[17], N[19]) ;
  	  PIX_SORT(N[17], N[18]) ; PIX_SORT(N[21], N[22]) ; PIX_SORT(N[20], N[22]) ;
	  PIX_SORT(N[20], N[21]) ; PIX_SORT(N[23], N[24]) ; PIX_SORT(N[2], N[5]) ;
	  PIX_SORT(N[3], N[6]) ;   PIX_SORT(N[0], N[6]) ;   PIX_SORT(N[0], N[3]) ;
	  PIX_SORT(N[4], N[7]) ;   PIX_SORT(N[1], N[7]) ;   PIX_SORT(N[1], N[4]) ;
	  PIX_SORT(N[11], N[14]) ; PIX_SORT(N[8], N[14]) ;  PIX_SORT(N[8], N[11]) ;
	  PIX_SORT(N[12], N[15]) ; PIX_SORT(N[9], N[15]) ;  PIX_SORT(N[9], N[12]) ;
	  PIX_SORT(N[13], N[16]) ; PIX_SORT(N[10], N[16]) ; PIX_SORT(N[10], N[13]) ;
	  PIX_SORT(N[20], N[23]) ; PIX_SORT(N[17], N[23]) ; PIX_SORT(N[17], N[20]) ;
	  PIX_SORT(N[21], N[24]) ; PIX_SORT(N[18], N[24]) ; PIX_SORT(N[18], N[21]) ;
	  PIX_SORT(N[19], N[22]) ; PIX_SORT(N[8], N[17]) ;  PIX_SORT(N[9], N[18]) ;
	  PIX_SORT(N[0], N[18]) ;  PIX_SORT(N[0], N[9]) ;   PIX_SORT(N[10], N[19]) ;
	  PIX_SORT(N[1], N[19]) ;  PIX_SORT(N[1], N[10]) ;  PIX_SORT(N[11], N[20]) ;
	  PIX_SORT(N[2], N[20]) ;  PIX_SORT(N[2], N[11]) ;  PIX_SORT(N[12], N[21]) ;
	  PIX_SORT(N[3], N[21]) ;  PIX_SORT(N[3], N[12]) ;  PIX_SORT(N[13], N[22]) ;
 	  PIX_SORT(N[4], N[22]) ;  PIX_SORT(N[4], N[13]) ;  PIX_SORT(N[14], N[23]) ;
	  PIX_SORT(N[5], N[23]) ;  PIX_SORT(N[5], N[14]) ;  PIX_SORT(N[15], N[24]) ;
	  PIX_SORT(N[6], N[24]) ;  PIX_SORT(N[6], N[15]) ;  PIX_SORT(N[7], N[16]) ;
	  PIX_SORT(N[7], N[19]) ;  PIX_SORT(N[13], N[21]) ; PIX_SORT(N[15], N[23]) ;
	  PIX_SORT(N[7], N[13]) ;  PIX_SORT(N[7], N[15]) ;  PIX_SORT(N[1], N[9]) ;
	  PIX_SORT(N[3], N[11]) ;  PIX_SORT(N[5], N[17]) ;  PIX_SORT(N[11], N[17]) ;
	  PIX_SORT(N[9], N[17]) ;  PIX_SORT(N[4], N[10]) ;  PIX_SORT(N[6], N[12]) ;
	  PIX_SORT(N[7], N[14]) ;  PIX_SORT(N[4], N[6]) ;   PIX_SORT(N[4], N[7]) ;
	  PIX_SORT(N[12], N[14]) ; PIX_SORT(N[10], N[14]) ; PIX_SORT(N[6], N[7]) ;
	  PIX_SORT(N[10], N[12]) ; PIX_SORT(N[6], N[10]) ;  PIX_SORT(N[6], N[17]) ;
	  PIX_SORT(N[12], N[17]) ; PIX_SORT(N[7], N[17]) ;  PIX_SORT(N[7], N[10]) ;
 	  PIX_SORT(N[12], N[18]) ; PIX_SORT(N[7], N[12]) ;  PIX_SORT(N[10], N[18]) ;
	  PIX_SORT(N[12], N[20]) ; PIX_SORT(N[10], N[20]) ; PIX_SORT(N[10], N[12]) ;

	  *pout = N[12]; /* middle element of the 25 = median */
	  
    }
    /*
     * The inner loop advanced pin by Y_SIZE-2*BOUNDARY; add the remaining
     * 2*BOUNDARY so pin sits at the start of the next scan-line.
     */
    pin += 2*BOUNDARY; /* incr scan-lines in preparation for next iteration */
    pout += BOUNDARY; /* skip the unfiltered right border of this row */
  }
  
  /* ii is one past the last row here, so this is the last row filtered */
  return ii-1;
}

/*
 * March down the image block-by-block, filtering along the way.
 *
 * in_img is paged into input_buf and the filtered rows are paged from
 * output_buf to out_img with DAT_copy(), NUM_SCAN_LINES rows at a time:
 *
 *   prologue - image rows 0..NUM_SCAN_LINES-1; the first BOUNDARY output
 *              rows are zeroed, the rest are filtered.
 *   interior - every full block in between.
 *   epilogue - the last NUM_SCAN_LINES rows; the final BOUNDARY output
 *              rows are zeroed.
 *
 * Requires X_SIZE to be a multiple of NUM_SCAN_LINES and the image to span
 * at least two blocks (the prologue and epilogue always run).
 */
void filter_image(void)
{
  Uint32  id_EDMAin  = DAT_XFRID_WAITNONE,
          id_EDMAout = DAT_XFRID_WAITNONE;
  int irow = BOUNDARY; /* image row of the first scan-line filtered next */
  unsigned char *pout_img = out_img,
                *pin_img = &in_img[(NUM_SCAN_LINES+BOUNDARY)*Y_SIZE],
                /*
                 * We reuse the bottom-most portion of input_buf
                 * by shifting it to the top, prior to the beginning
                 * of the subsequent block filtering.  After the prologue
                 * only BLOCK_X_SIZE-BOUNDARY rows are loaded, so the rows
                 * to move the first time (pinput_buf_row2move_1) sit
                 * BOUNDARY rows higher than on every later pass
                 * (pinput_buf_row2move_n).
                 */
                *pinput_buf_row2move_1 = input_buf + (BLOCK_X_SIZE-3*BOUNDARY)*Y_SIZE,
                *pinput_buf_row2move_n = input_buf + NUM_SCAN_LINES*Y_SIZE,
                *pinput_buf_row2move = pinput_buf_row2move_1,
                *pinput_buf_row2copy_into = input_buf+2*BOUNDARY*Y_SIZE;

  /**************************************************************
   * Algorithm 'prologue': filter the 1st block
   **************************************************************/
  id_EDMAin = DAT_copy(in_img, input_buf, (BLOCK_X_SIZE-BOUNDARY)*Y_SIZE);
  memclear(output_buf, BOUNDARY*Y_SIZE); /* 1st few rows are 0 */
  DAT_wait(id_EDMAin);

  /*
   * Only NUM_SCAN_LINES-BOUNDARY rows are filtered here: output starts
   * BOUNDARY rows into output_buf, and only BLOCK_X_SIZE-BOUNDARY input
   * rows have been loaded.  Asking for NUM_SCAN_LINES rows would write
   * BOUNDARY rows past the end of output_buf.
   */
  filter_block(irow,
               NUM_SCAN_LINES-BOUNDARY,
               input_buf,
               output_buf + BOUNDARY*Y_SIZE);

  /**************************************************************
   * Filter the interior of the image
   **************************************************************/

  for (irow = NUM_SCAN_LINES; irow<X_SIZE-NUM_SCAN_LINES; irow += NUM_SCAN_LINES) {

    /* page out the most recently processed block */
    id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
    pout_img += NUM_SCAN_LINES*Y_SIZE;

    /* page in the next block of pixel data */

    /*
     * 1st shift the scan-lines we can reuse from the bottom
     * to the top of the scratch buffer.  This must finish before the
     * DMA below is started, because the DMA overwrites the source rows.
     */
    memcpy(input_buf, pinput_buf_row2move, 2*BOUNDARY*Y_SIZE);
    pinput_buf_row2move = pinput_buf_row2move_n;

    /* DMA in next set of scan-lines */
    id_EDMAin = DAT_copy(pin_img, pinput_buf_row2copy_into, NUM_SCAN_LINES*Y_SIZE);
    pin_img += NUM_SCAN_LINES*Y_SIZE;

    /* gotta wait now for both xfers to complete before proceeding */
    DAT_wait(id_EDMAout);
    DAT_wait(id_EDMAin);
    filter_block(irow, NUM_SCAN_LINES, input_buf, output_buf);
  }

  /**************************************************************
   * Algorithm 'epilogue': filter the last block
   **************************************************************/

  /* page out the most recently processed block of image data */
  id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
  pout_img += (NUM_SCAN_LINES)*Y_SIZE;

  /* page in the last block of data (only NUM_SCAN_LINES-BOUNDARY rows remain) */
  memcpy(input_buf, pinput_buf_row2move, 2*BOUNDARY*Y_SIZE); /* shift scan-lines */
  id_EDMAin = DAT_copy(pin_img, pinput_buf_row2copy_into, (NUM_SCAN_LINES-BOUNDARY)*Y_SIZE);

  /* gotta wait now for both xfers to complete before proceeding */
  DAT_wait(id_EDMAout);
  DAT_wait(id_EDMAin);
  filter_block(irow, NUM_SCAN_LINES-BOUNDARY, input_buf, output_buf);

  /* last few rows are zero */
  memclear(output_buf + (NUM_SCAN_LINES-BOUNDARY)*Y_SIZE, BOUNDARY*Y_SIZE);

  /* we're done, page out this final block of pixel data */
  id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
  DAT_wait(id_EDMAout);
}

/*
 * Profile filter_image(): run it several times, print the cycle count of
 * each run and the average over all runs.
 */
int main(void)
{
  TIMER_Handle hTimer;
  unsigned int start, stop, overhead, total = 0, t; /* timing */
  const int num_runs = 10; /* how many times to profile */
  int ii = 0;
      
  DSK6416_init(); /* initialize the DSK board support library */
  
  /* configure timer */
  hTimer = TIMER_open(TIMER_DEVANY,0); 
  TIMER_configArgs(hTimer, 0x000002C0, 0xFFFFFFFF, 0x00000000);  
  
  /* initialize EDMA (1st arg ignored w/ EDMA) */
  DAT_open(DAT_CHAANY, DAT_PRI_HIGH, 0);
  
  /* compute overhead of calling the timer. */  
  start    = TIMER_getCount(hTimer);  /* called twice to avoid L1D miss.  */
  start    = TIMER_getCount(hTimer); 
  stop     = TIMER_getCount(hTimer); 
  overhead = stop - start;

  for (; ii<num_runs; ++ii) {
    /*
     * Read the same timer that 'overhead' was measured with, so the
     * subtraction and the scale factor below use consistent units.
     */
    start = TIMER_getCount(hTimer); /* begin "profile area" */
      filter_image();
    stop = TIMER_getCount(hTimer); /* end "profile area" */
    /* NOTE(review): the factor 8 presumably converts timer ticks to CPU cycles (timer clocked at CPU/8) - confirm */
    t = (stop-start-overhead) * 8;
    total += t;
    printf("# cycles to filter image: %u\n", t);
  }
  
  printf("avg time is %.2f cycles.\n", (float)total/(float)num_runs);
  return 0;
}
