#include <board.h> /* EVM library */
#include <stdio.h> /* printf() */
#include <dsp_fir_gen.h>
#include <dsp_blk_move.h>
#include <string.h> /* memset() & memcpy() */
#include <time.h> /* clock() */

#include "..\image.h" /* image\kernel dimensions, example pixel data */
/*
 * Placement of the two full-size image buffers.
 *
 * DATA_ALIGN / DATA_SECTION are TI compiler pragmas: each symbol is requested
 * to be 4-byte aligned and to be placed in the linker section named "SBSRAM".
 * NOTE(review): "SBSRAM" is presumably mapped to external (off-chip) memory by
 * the linker command file -- confirm; filter_image() describes reads of in_img
 * as "external to internal RAM" copies.
 *
 * in_img is not defined in this file; presumably it comes from ..\image.h
 * (the pragmas must precede its definition to take effect -- TODO confirm).
 */
#pragma DATA_ALIGN (in_img, 4);
#pragma DATA_SECTION (in_img, "SBSRAM");

#pragma DATA_ALIGN (out_img, 4);
#pragma DATA_SECTION (out_img, "SBSRAM"); 
unsigned char out_img[N_PIXELS]; /* filtered image, written block by block by filter_image() */

/*
 * Constants that define how we march through the image.
 *
 * The image is processed in blocks of NUM_SCAN_LINES rows. These macros count
 * rows with X_SIZE (NUM_BLOCKS = X_SIZE/NUM_SCAN_LINES), while the code that
 * uses them treats Y_SIZE as the number of pixels in one row (row stride).
 */
#define BOUNDARY (NH/2) /* 1st and last BOUNDARY rows/cols in output set to 0 */
#define NUM_SCAN_LINES   16 /* output rows per block; must divide evenly into X_SIZE */
#define NUM_BLOCKS       (X_SIZE/NUM_SCAN_LINES) /* # partitions */
#define BLOCK_X_SIZE     (NUM_SCAN_LINES+BOUNDARY) /* # rows loaded and filtered for the very 1st block */
#define FIR_BLOCK_X_SIZE (NUM_SCAN_LINES+2*BOUNDARY) /* # rows of filtered data held in fir_buf */

/*
 * Filter coefficients in Q15 format. Every tap is 1310, which is
 * approximately (1/25) * 2^15, so applying the taps along a row and then
 * summing NH filtered rows yields an NH x NH box average.
 *
 * NOTE(review): the initializer lists exactly five taps, so this presumably
 * assumes NH == 5 -- confirm against ..\image.h.
 */
short h[NH] = {1310, 1310, 1310, 1310, 1310}; /* (1/25 * 2^15) */
/* Set for every output row in filter_image(): entry ii points at row ii of
 * the NH-row band of fir_buf that contributes to that output row. */
short *pFirBand[NH]; /* ptrs into the NH filtered rows */

/* These are scratch buffers, intended to live in on-chip RAM:
 *
 * input_buf = input pixels about to be passed through FIR filter
 *             (up to BLOCK_X_SIZE rows, plus NH-1 extra samples of slack;
 *             presumably because the FIR routine reads NH-1 samples past
 *             the last output position -- confirm against DSPLIB docs)
 * fir_buf = output of FIR filter placed in this buffer
 *           (FIR_BLOCK_X_SIZE rows: 2*BOUNDARY rows carried over from the
 *           previous block followed by the newly filtered rows)
 * output_buf = filtered pixels go here (one block of NUM_SCAN_LINES rows,
 *              copied to out_img once the block is complete)
 *
 * NOTE(review): no DATA_SECTION pragma is applied to these buffers, so the
 * on-chip placement depends on the linker's default data section mapping --
 * confirm in the linker command file.
 */
#pragma DATA_ALIGN (input_buf, 4);
short input_buf[BLOCK_X_SIZE*Y_SIZE + NH-1];
#pragma DATA_ALIGN (fir_buf, 4);
short fir_buf[FIR_BLOCK_X_SIZE*Y_SIZE + NH-1];
#pragma DATA_ALIGN (output_buf, 4);
unsigned char output_buf[NUM_SCAN_LINES*Y_SIZE];

/*
 * filter_image
 *
 * Applies the NH x NH averaging mask to in_img and stores the result in
 * out_img, working on one block of NUM_SCAN_LINES rows at a time so that the
 * inner loops only touch the scratch buffers (input_buf, fir_buf, output_buf).
 *
 * The mask is applied separably:
 *   1. rows are run through the 1D FIR filter h (DSP_fir_gen), and
 *   2. each output pixel is the sum of the NH vertically adjacent filtered
 *      values, taken from the NH-row band addressed by pFirBand[].
 *
 * The image is treated as X_SIZE rows of Y_SIZE pixels each (row stride
 * Y_SIZE), matching NUM_BLOCKS = X_SIZE/NUM_SCAN_LINES. The first and last
 * BOUNDARY rows and columns of out_img are set to zero.
 *
 * Takes no arguments and returns nothing; reads the globals in_img and h and
 * overwrites out_img, pFirBand and the three scratch buffers.
 */
void filter_image(void)
{
  int ii, jj, sum,
      irow = BOUNDARY, /* current output row, starts @ BOUNDARY to provide margin */
      ifir = 0, /* index of the top fir_buf row feeding the current output row */
      nfir2copy = 2*BOUNDARY*Y_SIZE; /* # filtered pixels to copy during transition to next block */
  unsigned char *p = output_buf + BOUNDARY*Y_SIZE; /* write cursor into output_buf */

  /* algorithm 'prologue': move 1st block into on-chip RAM and filter */
  DSP_blk_move(in_img, input_buf, BLOCK_X_SIZE*Y_SIZE);
  DSP_fir_gen(input_buf, h, fir_buf, NH, BLOCK_X_SIZE*Y_SIZE);

  memset(output_buf, 0, BOUNDARY*Y_SIZE); /* 1st BOUNDARY rows are zero */

  /*
   * algorithm main loop: filter the individual blocks.
   * Rows are counted with X_SIZE (not Y_SIZE), consistent with NUM_BLOCKS
   * and with the Y_SIZE row stride used for every buffer offset below.
   */
  for (; irow<X_SIZE-BOUNDARY; ++irow)
  {

    if (0 == irow%NUM_SCAN_LINES)
    {
      /*
       * We just hit the start of the next block, so here we:
       *
       * o copy the next block's worth of pixels into working buffer
       * o run the rows through the FIR filter
       * o copy the last block's worth of filtered pixels into output
       *   array
       * o do some prep work for the rest of the loop
       */
      int d = irow/NUM_SCAN_LINES, /* which block */
          /*
           * After the prologue fir_buf holds only BLOCK_X_SIZE rows
           * (BOUNDARY fewer than FIR_BLOCK_X_SIZE), so for the very 1st
           * transition the reusable rows start one BOUNDARY earlier.
           */
          k = (1==d) ? 3 : 2;

      /*
       * We move some rows from the bottom to the top of the scratch
       * buffer fir_buf, because we can reuse them in the next pass.
       * This variable defines the start row of the rows that will be
       * "moved"
       */
      int srow2copy = FIR_BLOCK_X_SIZE-k*BOUNDARY;

      /*
       * n is the # of rows to filter. The rows are fetched starting at
       * irow+BOUNDARY, so the last block has BOUNDARY fewer rows left.
       */
      int n = (d != NUM_BLOCKS-1) ? NUM_SCAN_LINES : NUM_SCAN_LINES-BOUNDARY;

      /* move from the bottom to the top */
      DSP_blk_move(fir_buf+srow2copy*Y_SIZE, fir_buf, nfir2copy);

      /* copy from external to internal RAM */
      DSP_blk_move(in_img+(irow+BOUNDARY)*Y_SIZE, input_buf, n*Y_SIZE);

      /* run through FIR filter, appending after the 2*BOUNDARY reused rows */
      DSP_fir_gen(input_buf, h, fir_buf+2*BOUNDARY*Y_SIZE, NH, n*Y_SIZE);

      /* copy the just processed pixels (previous block) into the final output array */
      memcpy(out_img+(d-1)*NUM_SCAN_LINES*Y_SIZE, output_buf, NUM_SCAN_LINES*Y_SIZE);

      /* restart at the top of both scratch buffers for the new block */
      ifir = 0;
      p = output_buf;
    } /* end if (we just finished a block) */

    /* setup the pointers to the filtered rows */
    for (ii=0; ii<NH; ++ii)
      pFirBand[ii] = fir_buf + (ifir+ii)*Y_SIZE;

    /* 1st BOUNDARY columns are set to zero */
    for (ii=0; ii<BOUNDARY; ++ii)
      *p++ = 0;

    /*
     * in this loop the 2D mask is applied from the 1D filtered rows.
     * A row is Y_SIZE pixels wide, so Y_SIZE-2*BOUNDARY interior pixels
     * are produced; together with the zeroed margins exactly Y_SIZE bytes
     * are written per row, which is what output_buf is sized for.
     */
    for (ii=0; ii<Y_SIZE-2*BOUNDARY; ++ii) {
      sum = 0;
      for (jj=0; jj<NH; ++jj)
        sum += *(pFirBand[jj])++;
      *p++ = sum;
    }

    /* last BOUNDARY columns are set to zero */
    for (ii=0; ii<BOUNDARY; ++ii)
      *p++ = 0;

    ifir++;

  } /* end (for each row) */

  /* algorithm 'epilogue': handle the final block */

  /* zero out final BOUNDARY rows */
  memset(p, 0, BOUNDARY*Y_SIZE);

  /* copy final block into the output buffer */
  memcpy(out_img+(NUM_BLOCKS-1)*NUM_SCAN_LINES*Y_SIZE, output_buf, NUM_SCAN_LINES*Y_SIZE);
}

/*
 * Program entry point: initializes the board, then runs filter_image() N
 * times, printing the clock() delta of every run (minus the measured
 * overhead of calling clock() itself) followed by the average over all runs.
 *
 * Returns 0.
 */
int main(void)
{
  clock_t start, stop, overhead, t = 0; /* timing */
  const int N = 10; /* how many times to profile */
  int ii = 0;

  evm_init(); /* initialize the board */
  start = clock(); /* calculate overhead of calling clock */
  stop = clock();  /* and subtract this value from the results */
  overhead = stop - start;

  for (; ii<N; ++ii) {
    clock_t elapsed;

    start = clock(); /* begin "profile area" */
      filter_image();
    stop = clock(); /* end "profile area" */
    elapsed = stop-start-overhead;
    t += elapsed;
    /* clock_t is not guaranteed to be int: convert explicitly so that the
     * argument type matches the conversion specifier */
    printf("# cycles to filter image: %ld\n", (long)elapsed);
  }

  printf("avg time is %.2f cycles.\n", (float)t/(float)N);
  return 0;
}
