#define CHIP_6416
#include <dsk6416.h>
#include <stdio.h> /* printf() */
#include <string.h> /* memset() */
#include <img_conv_3x3.h>
#include <csl_timer.h>
#include <csl_dat.h> /* DMA */

#include "..\image.h" /* image\kernel dimensions, example pixel data */
/*
 * Source and destination images: both are 8-byte aligned and assigned to
 * the section named "SDRAM" (presumably off-chip memory - the actual
 * placement is decided by the linker command file, not shown here).
 *
 * NOTE(review): in_img is not defined in this file; presumably it comes
 * from image.h, which is included *before* these two pragmas. TI pragmas
 * normally have to precede the definition they apply to - confirm that
 * the in_img alignment/section requests actually take effect.
 */
#pragma DATA_ALIGN (in_img, 8);
#pragma DATA_SECTION (in_img, "SDRAM");

#pragma DATA_ALIGN (out_img, 8);
#pragma DATA_SECTION (out_img, "SDRAM"); 
unsigned char out_img[N_PIXELS]; /* filtered image */

/* filter dimensions and coefficients */
#define NH 3 /* kernel is of size NHxNH (needs to be 3 for this program) */
#define BOUNDARY (NH/2) /* 1st and last BOUNDARY rows/cols in output set to 0 */
#pragma DATA_ALIGN (H, 8)
/* 3x3 low-pass kernel; the weights sum to 16, undone by SHIFT below */
char H[NH*NH] = {
	1, 2, 1, /* 1/16 2/16 1/16 */
	2, 4, 2, /* 2/16 4/16 2/16 */
	1, 2, 1, /* 1/16 2/16 1/16 */
};
#define SHIFT 4 /* right-shift by 4 (div by 16) */

/*
 * # of columns in a scan-line that receive a filtered value (everything
 * but the BOUNDARY columns on either side).  Fully parenthesized so the
 * macro keeps its value when used inside a larger expression such as
 * 2*N_COLS_FILTERED.
 */
#define N_COLS_FILTERED (Y_SIZE-2*BOUNDARY)

/* 
 * For block processing, segment image into individual chunks
 * that are then paged in and out of internal memory.
 *
 * A block yields NUM_SCAN_LINES output rows; BLOCK_X_SIZE is that plus
 * BOUNDARY extra rows above and below (2*BOUNDARY in total), which is the
 * row count used to size input_buf.
 * X_SIZE and Y_SIZE are not defined in this file (presumably image.h).
 */
#define NUM_SCAN_LINES   16 /* # rows constituting a block, must divide evenly into X_SIZE */
#define NUM_BLOCKS       (X_SIZE/NUM_SCAN_LINES) /* # blocks in partitioned image */
#define BLOCK_X_SIZE     (NUM_SCAN_LINES+2*BOUNDARY) /* how many rows each block is */

/* 
 * These are scratch buffers, strategically placed in on-chip RAM: 
 *
 * input_buf = input pixels, passed to IMG_conv_3x3
 * output_buf = filtered pixels 
 *
 * Neither buffer carries a DATA_SECTION pragma, so their placement is
 * whatever the default data section maps to (on-chip, per the statement
 * above - confirm against the linker command file).
 */
#pragma DATA_ALIGN (input_buf, 8);
unsigned char input_buf[BLOCK_X_SIZE*Y_SIZE];
#pragma DATA_ALIGN (output_buf, 8);
/* 
 * NOTE: pad output_buf with 2*BOUNDARY pixels because
 * IMG_conv_3x3 requires its # cols arg to be a multiple of 8, so it is
 * asked for Y_SIZE pixels per row even though the write starts BOUNDARY
 * columns into the row.  Without the padding, filtering the last row of
 * the buffer would write past the end of the array.
 */
unsigned char output_buf[NUM_SCAN_LINES*Y_SIZE + 2*BOUNDARY];

/* 
 * Zero-fill a buffer; intended as a faster alternative to memset().
 *
 * ptr   = start of the region, must be 8-byte aligned (this is what the
 *         _nassert tells the compiler)
 * count = size of the region in bytes; it is divided by 8 to get the
 *         number of stores, so it must be a multiple of 8
 *
 * The MUST_ITERATE pragma promises the compiler at least 32 trips
 * through the loop, i.e. count must be at least 32*8 = 256 bytes.
 * Assumes sizeof(long) is 8 on this target - TODO confirm for the ABI
 * in use.
 */
void memclear( void * ptr, int count )
{
  long *dst = ptr;
  int n_words = count >> 3; /* one 8-byte store per trip */
  int i;
  _nassert((int)dst%8==0);
  #pragma MUST_ITERATE (32);
  for (i = 0; i < n_words; ++i)
    dst[i] = 0;
}

/*
 * Filter one block of an image.
 *
 * irow  = image row index of the first row produced by this call (only
 *         used for bookkeeping, never for addressing)
 * nrows = number of output rows to produce
 * pin   = first input scan-line handed to IMG_conv_3x3 for the first
 *         output row; advanced by Y_SIZE per output row
 * pout  = start of the first output row; each output row is Y_SIZE wide
 *
 * Returns the index of the last row filtered (irow+nrows-1), or irow-1
 * if nrows is not positive.  Callers add 1 to reach the next block.
 */
int filter_block(int irow, int nrows, 
                 const unsigned char *restrict pin, 
                 unsigned char *restrict pout)
{
  const int end_row = irow + nrows;
  int row = irow, col;

  while (row < end_row) {
    /* where the trailing zero columns of this output row begin */
    unsigned char *right_edge = pout + BOUNDARY + (N_COLS_FILTERED);

    /* leading BOUNDARY columns are forced to 0 */
    for (col = 0; col < BOUNDARY; ++col)
      pout[col] = 0;

    /*
     * Only N_COLS_FILTERED results are kept, but Y_SIZE is passed as the
     * pixel count because IMG_conv_3x3 needs a multiple of 8.  The extra
     * results spill over the right edge; the row's trailing columns are
     * re-zeroed just below.
     */
    IMG_conv_3x3(pin, pout + BOUNDARY, Y_SIZE, H, SHIFT);

    /* trailing BOUNDARY columns are forced to 0 */
    for (col = 0; col < BOUNDARY; ++col)
      right_edge[col] = 0;

    /* step both pointers down one scan-line */
    pin  += Y_SIZE;
    pout += Y_SIZE;
    ++row;
  }

  return row - 1;
}

/*
 * March down the image block-by-block, filtering along the way.
 *
 * Strips of in_img are DMA'd into input_buf, filtered into output_buf by
 * filter_block(), and every finished set of NUM_SCAN_LINES rows is DMA'd
 * to out_img.  The first and last BOUNDARY rows of out_img are written
 * as zeros.  Works on the file-scope image and scratch buffers; takes no
 * arguments and returns nothing.
 */
void filter_image(void)
{
  Uint32  id_EDMAin  = DAT_XFRID_WAITNONE,
          id_EDMAout = DAT_XFRID_WAITNONE;
  int irow = BOUNDARY;
  unsigned char *pout_img = out_img,
                /* first input row NOT loaded by the prologue */
                *pin_img = &in_img[(NUM_SCAN_LINES+BOUNDARY)*Y_SIZE],
                /*
                 * We reuse the bottom-most portion of input_buf
                 * by shifting it to the top, prior to the beginning 
                 * of the subsequent block filtering.  The 1st time 
                 * through the "interior of the image" loop, the
                 * pointer to the bottom-most portion (pinput_buf_row2move_1)
                 * is different from the subsequent iterations
                 * (pinput_buf_row2move_n), because the prologue loads
                 * BOUNDARY fewer rows than a full block.
                 */
                *pinput_buf_row2move_1 = input_buf + (BLOCK_X_SIZE-3*BOUNDARY)*Y_SIZE,
                *pinput_buf_row2move_n = input_buf + NUM_SCAN_LINES*Y_SIZE,
                *pinput_buf_row2move = pinput_buf_row2move_1,
                *pinput_buf_row2copy_into = input_buf+2*BOUNDARY*Y_SIZE;

  /**************************************************************
   * Algorithm 'prologue': filter the 1st block
   **************************************************************/
  id_EDMAin = DAT_copy(in_img, input_buf, (BLOCK_X_SIZE-BOUNDARY)*Y_SIZE);
  memclear(output_buf, BOUNDARY*Y_SIZE); /* 1st few rows are 0 */
  DAT_wait(id_EDMAin);

  /*
   * The first BOUNDARY output rows are the zeros written above, so only
   * NUM_SCAN_LINES-BOUNDARY rows remain to be filtered in this block
   * (same count as the epilogue).  Asking for NUM_SCAN_LINES rows here
   * would consume a scan-line that has not been paged in yet and write
   * one row past the end of output_buf.
   *
   * filter_block returns the last row it filtered; +1 moves irow to the
   * first row of the next block, which keeps the interior loop count
   * the same as before.
   */
  irow = 1 + filter_block(irow, 
                          NUM_SCAN_LINES-BOUNDARY, 
                          input_buf, 
                          output_buf + BOUNDARY*Y_SIZE);

  /**************************************************************
   * Filter the interior of the image
   **************************************************************/

  /* ++irow steps from the last filtered row to the next block's 1st row */
  for (; irow<X_SIZE-NUM_SCAN_LINES; ++irow) {

    /* page out the most recently processed block */
    id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
    pout_img += NUM_SCAN_LINES*Y_SIZE;

    /* page in the next block of pixel data */

    /* 
     * 1st shift the scan-lines we can reuse from the bottom 
     * to the top of the scratch buffer.
     */
    memcpy(input_buf, pinput_buf_row2move, 2*BOUNDARY*Y_SIZE);
    pinput_buf_row2move = pinput_buf_row2move_n;

    /* DMA in next set of scan-lines */
    id_EDMAin = DAT_copy(pin_img, pinput_buf_row2copy_into, NUM_SCAN_LINES*Y_SIZE);
    pin_img += NUM_SCAN_LINES*Y_SIZE;

    /*
     * gotta wait now for both xfers to complete before proceeding:
     * filter_block reads input_buf and overwrites output_buf
     */
    DAT_wait(id_EDMAout);
    DAT_wait(id_EDMAin);
    irow = filter_block(irow, NUM_SCAN_LINES, input_buf, output_buf);
  }

  /**************************************************************
   * Algorithm 'epilogue': filter the last block
   **************************************************************/

  /* page out the most recently processed block of image data */
  id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
  pout_img += (NUM_SCAN_LINES)*Y_SIZE;

  /* page in the last block of data (BOUNDARY fewer rows are left) */
  memcpy(input_buf, pinput_buf_row2move, 2*BOUNDARY*Y_SIZE); /* shift scan-lines */
  id_EDMAin = DAT_copy(pin_img, pinput_buf_row2copy_into, (NUM_SCAN_LINES-BOUNDARY)*Y_SIZE);

  /* gotta wait now for both xfers to complete before proceeding */
  DAT_wait(id_EDMAout);
  DAT_wait(id_EDMAin);
  filter_block(irow, NUM_SCAN_LINES-BOUNDARY, input_buf, output_buf);

  /* last few rows are zero */
  memclear(output_buf + (NUM_SCAN_LINES-BOUNDARY)*Y_SIZE, BOUNDARY*Y_SIZE);

  /* we're done, page out this final block of pixel data */
  id_EDMAout = DAT_copy(output_buf, pout_img, NUM_SCAN_LINES*Y_SIZE);
  DAT_wait(id_EDMAout);
}

/*
 * Program entry point: initializes the board, a timer and the DMA
 * module, then runs filter_image() N times, printing the cost of each
 * run and the average.  Returns 0.
 */
int main(void)
{
  TIMER_Handle hTimer;
  unsigned int start, stop, overhead, total = 0, t; /* timing */
  const int N = 10; /* how many times to profile */
  int ii = 0;

  DSK6416_init(); /* initialize the DSK board support library */

  /*
   * configure timer: control word, period, initial count.
   * NOTE(review): 0x2C0 is taken to start a free-running timer - confirm
   * the bit meanings against the CSL timer documentation.
   */
  hTimer = TIMER_open(TIMER_DEVANY,0); 
  TIMER_configArgs(hTimer, 0x000002C0, 0xFFFFFFFF, 0x00000000);  

  /* initialize EDMA (1st arg ignored w/ EDMA) */
  DAT_open(DAT_CHAANY, DAT_PRI_HIGH, 0);

  /* compute overhead of calling the timer. */  
  start    = TIMER_getCount(hTimer);  /* called twice to avoid L1D miss.  */
  start    = TIMER_getCount(hTimer); 
  stop     = TIMER_getCount(hTimer); 
  overhead = stop - start;

  for (; ii<N; ++ii) {
    /*
     * Read the same timer that the overhead was measured with, so the
     * subtraction and the scale factor below are in consistent units.
     */
    start = TIMER_getCount(hTimer); /* begin "profile area" */
      filter_image();
    stop = TIMER_getCount(hTimer); /* end "profile area" */
    /* x8: presumably one timer tick per 8 CPU cycles - TODO confirm */
    t = (stop-start-overhead) * 8;
    total += t;
    printf("# cycles to filter image: %u\n", t); /* t is unsigned */
  }

  printf("avg time is %.2f cycles.\n", (float)total/(float)N);
  return 0;
}
