#include <stdio.h> /* printf() */
#include <dsp_fir_gen.h>
#include <dsp_blk_move.h>
#include <string.h> /* memset() & memcpy() */
#include <time.h> /* clock() */
#define CHIP_6701 /* change appropriately if using a different chipset */
#include <csl.h> 
#include <csl_dma.h>
#include <csl_irq.h>

#include "..\image.h" /* image\kernel dimensions, example pixel data */
/*
 * Placement of the source and destination images.
 *
 * Both in_img (not defined in this part of the file) and out_img are
 * aligned on a 4-byte boundary and assigned to the linker section named
 * "SBSRAM". The 4-byte alignment matches the 32-bit element size that
 * dma_copy_block() programs into the DMA channel.
 * NOTE(review): "SBSRAM" is presumably mapped to external memory by the
 * linker command file -- confirm against the project's .cmd file.
 */
#pragma DATA_ALIGN (in_img, 4);
#pragma DATA_SECTION (in_img, "SBSRAM");

#pragma DATA_ALIGN (out_img, 4);
#pragma DATA_SECTION (out_img, "SBSRAM"); 
unsigned char out_img[N_PIXELS]; /* filtered image */

/*
 * Constants that define how we march through the image.
 *
 * The image is processed in blocks of NUM_SCAN_LINES rows. To filter a
 * block, BOUNDARY extra rows above and below it are needed, hence the
 * input cache holds NUM_SCAN_LINES + 2*BOUNDARY rows.
 * NH, X_SIZE and Y_SIZE are not defined in this part of the file.
 */
#define BOUNDARY         (NH/2) /* 1st and last BOUNDARY rows/cols in output set to 0 */
#define NUM_SCAN_LINES   16 /* # rows constituting a block, must divide evenly into X_SIZE */
#define NUM_BLOCKS       (X_SIZE/NUM_SCAN_LINES) /* # blocks in partitioned image */
#define NUM_IBUF_ROWS    (NUM_SCAN_LINES+2*BOUNDARY) /* # scan-lines in input buf cache */

/*
 * Filter coefficients in Q15 format (gaussian kernel).
 *
 * The table is symmetric about its centre and the 25 entries add up to
 * 32769, i.e. approximately 1.0 in Q15 (32768). Row ifir of this table
 * is applied to the ifir-th scan-line of the NH-row window in
 * filter_image(). The initializer lists five values per row, so NH is
 * expected to be 5.
 */
short h[NH][NH] = {
  {0,	1,		7,		1,		0},
  {1,	371,	2744,	371,	1},
  {7, 	2744,	20273,	2744,	7},
  {1, 	371,	2744,	371,	1},
  {0, 	1,		7,		1,		0},
};

/* These are scratch buffers, strategically placed in on-chip RAM: 
 *
 * input_buf = input pixels about to be passed through FIR filter
 * fir_buf = output of FIR filter placed in this buffer
 * output_buf = filtered pixels go here
 *
 * Sizes:
 *   input_buf  holds NUM_IBUF_ROWS scan-lines of Y_SIZE samples plus
 *              NH-1 trailing samples of padding.
 *   fir_buf    holds one row-filtered scan-line per kernel row.
 *   output_buf holds one block (NUM_SCAN_LINES scan-lines) of 8-bit
 *              output pixels.
 *
 * All three are 4-byte aligned, matching the 32-bit DMA element size
 * used by dma_copy_block().
 *
 * NOTE(review): no DATA_SECTION pragma is applied here, so the on-chip
 * placement presumably comes from the linker command file -- confirm.
 * NOTE(review): the NH-1 padding presumably covers the samples
 * DSP_fir_gen reads past the end of the last scan-line -- confirm
 * against the DSPLIB documentation.
 */
#pragma DATA_ALIGN (input_buf, 4);
short input_buf[NUM_IBUF_ROWS*Y_SIZE + NH-1];
#pragma DATA_ALIGN (fir_buf, 4);
short fir_buf[NH][Y_SIZE + NH-1];
#pragma DATA_ALIGN (output_buf, 4);
unsigned char output_buf[NUM_SCAN_LINES*Y_SIZE];

/* 
 * A couple of macros that convert a number of pixels N (stored as
 * unsigned char or as short, respectively) into the number of
 * int-sized elements the DMA transfer function expects.
 *
 * The argument and the whole expansion are parenthesized so that an
 * expression such as ELEM_COUNT_SHORT(a+b) is converted as a unit.
 * The division truncates: if the byte count is not a multiple of
 * sizeof(int), the trailing bytes are not counted.
 * Assumes sizeof(int) equals the 32-bit DMA element size on the target.
 */
#define ELEM_COUNT_UCH(N)   (((N)*sizeof(unsigned char))/sizeof(int))
#define ELEM_COUNT_SHORT(N) (((N)*sizeof(short))/sizeof(int))

/*
 * Completion flag shared between the DMA interrupt service routine and
 * dma_copy_block(): the ISR sets it to TRUE, dma_copy_block() clears it
 * before starting a transfer and then polls it. Declared volatile so
 * the polling loop re-reads it on every iteration.
 */
volatile int transfer_done = FALSE;

/*
 * Reference to the interrupt vector table (defined outside this file);
 * main() passes it to IRQ_setVecs() so the ISRs get hooked up.
 */
extern far void vectors();

/*
 * DMA channel 1 completion ISR (vecs.asm routes IRQ 09 here).
 * Raises transfer_done, the flag dma_copy_block() polls while it
 * waits for the transfer to finish.
 */
interrupt void c_int09(void)
{
  transfer_done = TRUE;
}

/*
 * Enables the interrupts needed to receive the DMA channel 1
 * completion event (IRQ_EVT_DMAINT1, serviced by c_int09).
 *
 * Order of operations: enable NMI and global interrupts, then disable
 * the DMA event, clear any pending instance of it, and finally enable
 * it, so that a stale pending event is discarded before the event is
 * unmasked.
 */
void set_interrupts_dma(void)
{
  IRQ_nmiEnable();
  IRQ_globalEnable();
  IRQ_disable(IRQ_EVT_DMAINT1); /* INT9 */
  IRQ_clear(IRQ_EVT_DMAINT1);   /* drop any stale pending event */
  IRQ_enable(IRQ_EVT_DMAINT1);
}

/*
 * Copies a contiguous block of memory with DMA channel 1 and blocks
 * until the transfer has completed.
 *
 * psrc          - source address
 * pdst          - destination address
 * element_count - number of 32-bit elements to transfer (see the
 *                 ELEM_COUNT_* macros)
 *
 * The channel is opened (with reset), configured, started and closed
 * on every call. Completion is signalled by the c_int09 ISR through
 * the global transfer_done flag, which this function polls.
 *
 * NOTE(review): the handle returned by DMA_open() is not checked; if
 * the open can fail, the wait loop below would presumably never
 * terminate -- confirm against the CSL documentation.
 * NOTE(review): with a 32-bit element size, psrc and pdst presumably
 * need to be 4-byte aligned -- confirm.
 */
void dma_copy_block(void *psrc, void *pdst, int element_count)
{
  static DMA_Handle hDma1;
  hDma1 = DMA_open(DMA_CHA1, DMA_OPEN_RESET); /* Handle to DMA ch1 */
    DMA_configArgs(hDma1,
		/* primary control register: 32-bit elements, both addresses
		   post-incremented, transfer-complete interrupt enabled */
		DMA_PRICTL_RMK(
		DMA_PRICTL_DSTRLD_DEFAULT,
		DMA_PRICTL_SRCRLD_DEFAULT,
		DMA_PRICTL_EMOD_DEFAULT,
		DMA_PRICTL_FS_DEFAULT,
		DMA_PRICTL_TCINT_ENABLE, /* TCINT =1 */
		DMA_PRICTL_PRI_DMA, /* DMA priority over CPU */
		DMA_PRICTL_WSYNC_DEFAULT,
		DMA_PRICTL_RSYNC_DEFAULT,
		DMA_PRICTL_INDEX_DEFAULT,
		DMA_PRICTL_CNTRLD_DEFAULT,
		DMA_PRICTL_SPLIT_DISABLE,
		DMA_PRICTL_ESIZE_32BIT, /* 32-bit element size */
		DMA_PRICTL_DSTDIR_INC, /* incr dest by element size */
		DMA_PRICTL_SRCDIR_INC, /* incr src by element size */
		DMA_PRICTL_START_DEFAULT
		),
	/* secondary control register: only the block-complete interrupt
	   is enabled, everything else is left at its default */
	DMA_SECCTL_RMK(
		DMA_SECCTL_DMACEN_DEFAULT,
		DMA_SECCTL_WSYNCCLR_DEFAULT,
		DMA_SECCTL_WSYNCSTAT_DEFAULT,
		DMA_SECCTL_RSYNCCLR_DEFAULT,
		DMA_SECCTL_RSYNCSTAT_DEFAULT,
		DMA_SECCTL_WDROPIE_DEFAULT,
		DMA_SECCTL_WDROPCOND_DEFAULT,
		DMA_SECCTL_RDROPIE_DEFAULT,
		DMA_SECCTL_RDROPCOND_DEFAULT,
		DMA_SECCTL_BLOCKIE_ENABLE, // BLOCK IE=1 enables DMA channel int
		DMA_SECCTL_BLOCKCOND_DEFAULT,
		DMA_SECCTL_LASTIE_DEFAULT,
		DMA_SECCTL_LASTCOND_DEFAULT,
		DMA_SECCTL_FRAMEIE_DEFAULT,
		DMA_SECCTL_FRAMECOND_DEFAULT,
		DMA_SECCTL_SXIE_DEFAULT,
		DMA_SECCTL_SXCOND_DEFAULT
		),
	DMA_SRC_RMK((Uint32)psrc), /* source buffer */
	DMA_DST_RMK((Uint32)pdst), /* destination buffer */
	DMA_XFRCNT_RMK(
		DMA_XFRCNT_FRMCNT_DEFAULT,
		DMA_XFRCNT_ELECNT_OF(element_count) /* set xfer element count */
	)
  );

  /* initialize the interrupts: */
  /* Enable the interrupts after the DMA channels are opened */
  /* as the DMA_OPEN_RESET clears and disables the channel */
  /* interrupt when specified and clears the corresponding */
  /* interrupt bits in the IER. */
  set_interrupts_dma();
  
  /* clear the completion flag BEFORE starting, so the ISR cannot be missed */
  transfer_done = FALSE;
  DMA_start(hDma1); /* start DMA channel 1 */
  
  /* busy-wait until the c_int09 ISR reports that the transfer is done */
  while (!transfer_done);

  DMA_close(hDma1); /* close the channel when the transfer is complete */
}

void filter_image()
{
  int irow, icol, ifir; /* counters */
  int sum, nlines2copy;
  /* 
   * pout_buf initially points BOUNDARY rows from the
   * top because first BOUNDARY rows are set to 0.
   */  
  unsigned char *pout_buf = output_buf + BOUNDARY*Y_SIZE;
  /*
   * pscanlines is a vector of pointers, pointing to the
   * NH image rows to be passed through the filter.
   */
  short *pscanlines[NH];
  /* 
   * pin_save is used in the main loop kernel: each time we 
   * page in a new block, we can reuse the last 2*BOUNDARY
   * rows from the previous block if we "move it to the
   * top," so here I'm setting this address as a convienence
   */
  const short *pin_save 
    = &input_buf[NUM_IBUF_ROWS*Y_SIZE - 2*BOUNDARY*Y_SIZE];
  short *pin_img;
  unsigned char *pout_img;
  
  /* 
   * algorithm 'prologue': move 1st block into on-chip RAM and filter.
   * (1st time around do not need to use all of input_buf)
   */
  dma_copy_block(in_img, 
                 input_buf+BOUNDARY*Y_SIZE, 
                 ELEM_COUNT_SHORT((NUM_IBUF_ROWS-BOUNDARY)*Y_SIZE));
  memset(output_buf, 0, BOUNDARY*Y_SIZE); /* 1st BOUNDARY rows are zero */

  /*
   * initialize pscanlines vector of pointers, initially the 
   * very pointer indexes into the 1st row in the current block.
   */ 
  for (ifir=0; ifir<NH; ++ifir)
    pscanlines[ifir] = &input_buf[(BOUNDARY+ifir)*Y_SIZE];
    
  /* 
   * march down the image and filter each row, and 
   * then sum down columns to implement a 2D filter
   */
  for (irow=BOUNDARY; irow<NUM_SCAN_LINES; ++irow, pout_buf+=Y_SIZE)
  {
    /* filter the pixels */
    for (ifir=0; ifir<NH; ++ifir)
	  DSP_fir_gen(pscanlines[ifir], &h[ifir][0], &fir_buf[ifir][0], NH, Y_SIZE);

    /* just like the rows, 1st and last BOUNDARY cols are zero */
    for (icol=0; icol<BOUNDARY; ++icol)
	  pout_buf[icol] = pout_buf[Y_SIZE-icol-1] = 0;
	
	/* combine row-filtered output via summation => 2D filtered result */
	for (icol=BOUNDARY; icol<Y_SIZE-BOUNDARY; ++icol) {
	  sum = 0;		
	  for (ifir=0; ifir<NH; ++ifir)
	    sum += fir_buf[ifir][icol];
	  pout_buf[icol] = sum;
	}

    /* get ready for next row, by incrementing pscanlines */
	for (ifir=0; ifir<NH-1; ++ifir)
	  pscanlines[ifir] = pscanlines[ifir+1];
	pscanlines[NH-1] += Y_SIZE;
  } /* end (for each row in the 1st block) */  
  
  /* loop kernel: filter the rest of the image */
  pin_img = &in_img[(NUM_SCAN_LINES+BOUNDARY)*Y_SIZE],
  pout_img = out_img;
  
  for (; irow<Y_SIZE-BOUNDARY; ++irow, pout_buf+=Y_SIZE)
  {
    if (0 == irow%NUM_BLOCKS) /* time to page in the next block of pixels */
	{
	  /* copy the just processed block to the output image */
	  dma_copy_block(output_buf, 
	                 pout_img, 
	                 ELEM_COUNT_UCH(NUM_SCAN_LINES*Y_SIZE));
	  
	  /* move last few scan-lines in cache to the top */
	  memcpy(input_buf, pin_save, 2*BOUNDARY*Y_SIZE*sizeof(short));
	  
	  /* # of scan-lines to page into internal memory */
	  nlines2copy = (NUM_BLOCKS-1 != irow/NUM_SCAN_LINES) ? NUM_SCAN_LINES : NUM_SCAN_LINES-BOUNDARY;
	  
	  /* page in next set of image pixels to internal RAM */
	  dma_copy_block(pin_img, 
	                 input_buf + 2*BOUNDARY*Y_SIZE, 
	                 ELEM_COUNT_SHORT(nlines2copy*Y_SIZE));	

      /* 
       * and get ready for the next round of processing 
       * (reset pointer into output buffer residing in internal RAM)
       */
	  pout_buf = output_buf;
	  
	  /* reset the scan-line pointers into the cached input buffer */
	  for (ifir=0; ifir<NH; ++ifir)
	    pscanlines[ifir] = &input_buf[ifir*Y_SIZE];

      /* increment pointers to input and output images */
	  pin_img += NUM_SCAN_LINES*Y_SIZE;
	  pout_img += NUM_SCAN_LINES*Y_SIZE;
    } /* end (if time to page in next block of pixels) */

    /* filter the pixels */
	for (ifir=0; ifir<NH; ++ifir)
	  DSP_fir_gen(pscanlines[ifir], &h[ifir][0], &fir_buf[ifir][0], NH, Y_SIZE);

    /* just like the rows, 1st and last BOUNDARY cols are zero */
	for (icol=0; icol<BOUNDARY; ++icol)
	  pout_buf[icol] = pout_buf[Y_SIZE-icol-1] = 0;

    /* combine row-filtered output via summation => 2D filtered result */
	for (icol=BOUNDARY; icol<Y_SIZE-BOUNDARY; ++icol) {
	  sum = 0;
	  for (ifir=0; ifir<NH; ++ifir)
	    sum += fir_buf[ifir][icol];
	  pout_buf[icol] = sum;
	}

    /* get ready for next row, by incrementing pscanlines */
	for (ifir=0; ifir<NH-1; ++ifir)
      pscanlines[ifir] = pscanlines[ifir+1];
	pscanlines[NH-1] += Y_SIZE;
  }  /* end (for each interior row) */
  
  /* zero out final few rows */
  memset(pout_buf, 0, BOUNDARY*Y_SIZE); 

  /* copy final block into the output buffer */
  dma_copy_block(output_buf,
                 out_img+(NUM_BLOCKS-1)*NUM_SCAN_LINES*Y_SIZE,
                 ELEM_COUNT_UCH(NUM_SCAN_LINES*Y_SIZE));
}

/*
 * Program entry point: initializes the chip support library, the
 * interrupt vector table and the DMA controller, then runs
 * filter_image() N times and prints the cycle count of every run and
 * the average over all runs.
 *
 * The cost of calling clock() itself is measured once up front and
 * subtracted from every measurement.
 *
 * Returns 0.
 */
int main(void)
{
  clock_t start, stop, overhead, elapsed, t = 0; /* timing */
  const int N = 10; /* how many times to profile */
  int ii;

  /* CSL_init() comes first: it has to run before any other CSL call */
  CSL_init(); /* initialize chip support library */
  IRQ_setVecs(vectors); /* point to the IRQ vector table */
  DMA_reset(INV);

  start = clock(); /* calculate overhead of calling clock */
  stop = clock();  /* and subtract this value from the results */
  overhead = stop - start;

  for (ii=0; ii<N; ++ii) {
    start = clock(); /* begin "profile area" */
      filter_image();
    stop = clock(); /* end "profile area" */
    elapsed = stop-start-overhead;
    t += elapsed;
    /* clock_t is not necessarily int: convert explicitly to match %ld */
    printf("# cycles to filter image: %ld\n", (long)elapsed);
  }
  
  printf("avg time is %.2f cycles.\n", (float)t/(float)N);
  return 0;
}
