// ======================================================================
// This program measures the sqrt() floating-point performance of the DE1-SoC board
//
// It performs the following:
//   1. Starts the timer
//   2. Runs a loop with 10 ops 10K times
//   3. Computes the time needed and displays sqrt(SW) on the LEDs
// ======================================================================
#define LEDR_BASE    0xFF200000
#define SW_BASE      0xFF200040
#define KEY_BASE     0xFF200050
#define TIMER_BASE   0xFF202000
#define Fcpu         100000000 
#define DEBUG 0

#include <stdio.h>      /* printf */
#include <math.h>       /* sqrt */
#include <stdlib.h>     /* abs */

// The n values and their corresponding operations are as follows:
// n = 0, Write 32 bits LSB input data to custom instruction
// n = 1, Write 32 bits MSB input data to custom instruction
// n = 2, Read  32 bits LSB data from the custom instruction
// n = 3, Read  32 bits MSB data from the custom instruction

// Base selector value and 8-bit selector mask for the FP_SQRT64 custom
// instruction.
#define ALT_CI_FP_SQRT64_0_N 0x0
#define ALT_CI_FP_SQRT64_0_N_MASK ((1<<8)-1)
// my_sqrt64(n, A): issue the custom instruction with selector n (masked to
// 8 bits and added to the base value) and the int operand A; the expression
// yields the int returned by the instruction.
// Both macro arguments are parenthesized so that expression arguments such
// as `a | b` are masked as a whole instead of being split by precedence.
#define my_sqrt64(n,A) \
    __builtin_custom_ini(ALT_CI_FP_SQRT64_0_N+((n)&ALT_CI_FP_SQRT64_0_N_MASK),(A))

// Word offsets (in units of int, relative to the timer base pointer) of the
// interval-timer registers this program touches, plus the values written to
// the control register.
enum {
  TMR_CONTROL     = 1,          // control register
  TMR_LOAD_LO     = 2,          // low 16 bits of the count-down start value
  TMR_LOAD_HI     = 3,          // high 16 bits of the count-down start value
  TMR_SNAP_LO     = 4,          // low 16 bits of the counter snapshot
  TMR_SNAP_HI     = 5,          // high 16 bits of the counter snapshot
  TMR_CTRL_START  = 0x4,        // START = 1, CONT = 0, ITO = 0
  TMR_CTRL_STOP   = 0x8,        // STOP = 1, START = 0, CONT = 0, ITO = 0
  TMR_START_COUNT = 0x7FFFFFFF  // value loaded before every measurement
};

// Stop the timer and load the start value 0x7FFFFFFF. The caller writes
// TMR_CTRL_START itself so that no call/return overhead lands in the timed
// region.
static void timer_arm(volatile int *timer)
{
  timer[TMR_CONTROL] = TMR_CTRL_STOP;
  timer[TMR_LOAD_LO] = 0xFFFF;
  timer[TMR_LOAD_HI] = 0x7FFF;
}

// Read back an already stopped timer and return the number of timer ticks
// that elapsed since it was started from TMR_START_COUNT.
static int timer_elapsed(volatile int *timer)
{
  int high_half, counter;

  // Make a counter snapshot by writing a dummy value to the low snapshot word
  timer[TMR_SNAP_LO] = 0;
  // Assemble the 32-bit snapshot from the two 16-bit timer registers
  high_half = timer[TMR_SNAP_HI] & 0xFFFF;
  counter = (timer[TMR_SNAP_LO] & 0xFFFF) | (high_half << 16);
  return TMR_START_COUNT - counter;  // start value minus remaining count
}

// Program entry point.
//   1. Demonstrates the SQRT64 custom instruction (CIP) on two operands:
//      the slider-switch value and 0x12345678 squared.
//   2. Times a single library sqrt(), a single my_sqrt64() including the
//      double<->int word conversion, and a single my_sqrt64() without it.
//   3. Times 100K sqrt() and 100K sqrtf() calls (10 per loop, 10K loops).
//   4. Loops forever showing sqrt(SW) on the red LEDs.
// Does not return in practice; the trailing return only satisfies the
// signature.
int main(void)
{
  volatile int *red_LED_ptr        = (int *) LEDR_BASE;   // Red LED address
  volatile int *SW_switch_ptr      = (int *) SW_BASE;     // Slider switch address
  volatile int *interval_timer_ptr = (int *) TIMER_BASE;  // Timer address
  int SW_value;
  int Value1, Value21 = 0;
  float j, result;
  float Tscale, Cscale;
  int k;

  double d[4];  double *d_ptr;
  int i[8];     int *i_ptr;

//================================================
// Out=123..8|In:12..8^2 |Output:4.0 |Input 16.0 |
// MSB  LSBs | MSB  LSBs | MSB  LSBs | MSB  LSBs |
//    d[3]   |   d[2]    |   d[1]    |   d[0]    |
// i[7] i[6] | i[5] i[4] | i[3] i[2] | i[1] i[0] |
//================================================
// NOTE(review): the double <-> int word conversion below goes through an
// int pointer aimed at a double. It assumes the low 32-bit word is stored at
// the lower address, and it steps outside ISO C's aliasing rules; it is kept
// as written because it is the "pointer ops" being benchmarked. Confirm the
// build flags (e.g. -fno-strict-aliasing) before raising the optimization
// level.

  printf("Hello from Nios II SQRT64 CIP\n\r");
  printf("Please set SW value to 16 for SQRT\n\r");
  SW_value = *(SW_switch_ptr);  // Read the SW slider switch values
  d[0] = (double) SW_value;     // First operand: expected to be 16.0
  d_ptr = &d[0];                // Get starting address
  i_ptr = (int *) d_ptr;        // Point to same address
  i[0] = (unsigned int) *i_ptr;        // First 32 bits (LSBs)
  i[1] = (unsigned int) *(i_ptr + 1);  // Second 32 bits (MSBs)
  printf("LSBs   : %08d_10=%08X_hex\n", i[0], (unsigned int) i[0]);
  printf("MSBs   : %08d_10=%08X_hex\n", i[1], (unsigned int) i[1]);

  k = 0x12345678;
  d[2] = (double) k;            // Second operand: a non-trivial value...
  d[2] = d[2] * d[2];           // ...squared, so its root is known
  d_ptr = &d[2];                // Get starting address
  i_ptr = (int *) d_ptr;        // Point to same address
  i[4] = (unsigned int) *i_ptr;        // First 32 bits (LSBs)
  i[5] = (unsigned int) *(i_ptr + 1);  // Second 32 bits (MSBs)
  printf("LSBs   : %08d_10=%08X_hex\n", i[4], (unsigned int) i[4]);
  printf("MSBs   : %08d_10=%08X_hex\n", i[5], (unsigned int) i[5]);

  *(red_LED_ptr) = 0;
  // Expected words for 16.0: LSBs = 0x00000000, MSBs = 0x40300000
  printf("====== Try: 16.0 (FP dec) 4.0=> 4010....0 (FP hex) ======\n");
  printf("with x=0x40100000 in hex x*x=40300000_00000000 in hex\n");
  // Fixed: the hex column used to print i[1] next to the decimal i[0]
  printf("write to  CIP: LSBs   : %08d_10=%08X_hex\n", i[0], (unsigned int) i[0]);
  k = my_sqrt64(0, i[0]);  // Value returned by a write is not used
  printf("write to  CIP: MSBs   : %08d_10=%08X_hex\n", i[1], (unsigned int) i[1]);
  k = my_sqrt64(1, i[1]);
  i[2] = my_sqrt64(2, 0);
  printf("read from CIP: LSBs   : %08d_10=%08X_hex\n", i[2], (unsigned int) i[2]);
  i[3] = my_sqrt64(3, 0);
  printf("read from CIP: MSBs   : %08d_10=%08X_hex\n", i[3], (unsigned int) i[3]);
  d_ptr = &d[1];           // Get starting address
  i_ptr = (int *) d_ptr;   // Point to same address
  *(i_ptr) = i[2];
  *(i_ptr + 1) = i[3];
  *(red_LED_ptr) = (int) d[1];

  printf("================= repeat with 2. number =================\n");
  printf("with x=0x12345678(hex)=>FP: 41B23456_78000000 gives x*x=4374B66D_C1DF4D84 hex\n");
  // lsb = 0xC1DF4D84 = 3252637060 (dec)
  // msb = 0x4374B66D = 1131722349 (dec)
  printf("write to  CIP: LSBs   : %08d_10=%08X_hex\n", i[4], (unsigned int) i[4]);
  k = my_sqrt64(0, i[4]);
  printf("write to  CIP: MSBs   : %08d_10=%08X_hex\n", i[5], (unsigned int) i[5]);
  k = my_sqrt64(1, i[5]);
  i[6] = my_sqrt64(2, 0);
  printf("read from CIP: LSBs   : %08d_10=%08X_hex\n", i[6], (unsigned int) i[6]);
  i[7] = my_sqrt64(3, 0);
  printf("read from CIP: MSBs   : %08d_10=%08X_hex\n", i[7], (unsigned int) i[7]);
  d_ptr = &d[3];           // Get starting address
  i_ptr = (int *) d_ptr;   // Point to same address
  *(i_ptr) = i[6];
  *(i_ptr + 1) = i[7];

  if (DEBUG) {
    printf("========== All double ...\n");
    for (k = 0; k < 4; k++) {
      printf(" double[%d] = %lf\n", k, d[k]);
    }
    printf("========== All integer ...\n");
    for (k = 0; k < 8; k++) {
      printf(" int[%d] = %08d_10=%08X_hex\n", k, i[k], (unsigned int) i[k]);
    }
  }

  printf("=========== Time standard double sqrt() ... =============\n");
  SW_value = *(SW_switch_ptr);  // Read the SW slider switch values
  d[0] = (double) SW_value;
  printf("Set SW for Sqrt() SW value = %d\n", SW_value);
  timer_arm(interval_timer_ptr);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_START;  // --- timed region begins
  d[1] = sqrt(d[0]);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_STOP;   // --- timed region ends
  Value21 = timer_elapsed(interval_timer_ptr);       // Elapsed timer ticks
  printf("Clock cycles single sqrt() = %d\n", Value21);
  printf("FP single sqrt result: %d\n\n", (int) d[1]);


  printf("========== Time CIP double my_sqrt64() with pointer ops  ===========\n");
  timer_arm(interval_timer_ptr);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_START;  // --- timed region begins
  d_ptr = &d[0];           // Get starting address
  i_ptr = (int *) d_ptr;   // Point to same address
  i[0] = (unsigned int) *i_ptr;        // First 32 bits
  i[1] = (unsigned int) *(i_ptr + 1);  // Second 32 bits
  k = my_sqrt64(0, i[0]);  // Write to CIP LSBs
  k = my_sqrt64(1, i[1]);  // Write to CIP MSBs
  i[2] = my_sqrt64(2, 0);  // Read the SQRT64 LSBs
  i[3] = my_sqrt64(3, 0);  // Read the SQRT64 MSBs
  d_ptr = &d[1];           // Get starting address
  i_ptr = (int *) d_ptr;   // Point to same address
  *(i_ptr) = i[2];         // Place LSBs in double
  *(i_ptr + 1) = i[3];     // MSBs in double
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_STOP;   // --- timed region ends
  Value21 = timer_elapsed(interval_timer_ptr);       // Elapsed timer ticks
  printf("Clock cycles single (with ptr ops) my_sqrt64() = %d\n", Value21);
  printf("FP single sqrt64 result: %d\n\n", (int) d[1]);

  printf("========== Time CIP double my_sqrt64() no pointer ops===========\n");
  timer_arm(interval_timer_ptr);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_START;  // --- timed region begins
  k = my_sqrt64(0, i[0]);  // Write LSBs
  k = my_sqrt64(1, i[1]);  // Write MSBs
  i[2] = my_sqrt64(2, 0);  // Read LSBs
  i[3] = my_sqrt64(3, 0);  // Read MSBs
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_STOP;   // --- timed region ends
  Value21 = timer_elapsed(interval_timer_ptr);       // Elapsed timer ticks
  printf("Clock cycles single (no ptr ops) my_sqrt64() = %d\n", Value21);
  // d[1] is not rewritten in this section; the value printed is the one
  // unpacked in the "with pointer ops" run above (same input words).
  printf("FP single sqrt64 result: %d\n\n", (int) d[1]);

  //================ FP SQRT loop test =====================
  // Both scale factors assume 100K operations per measurement:
  //   Tscale: ticks -> ns per operation  (ticks / Fcpu * 1e9 / 1e5)
  //   Cscale: ticks -> ticks per operation
  Tscale = 10.0/Fcpu*1000;
  Cscale = 1.0/100000.0;
  j = 4.0; *(red_LED_ptr) = 16;
  result = j * j;
  Value1 = sqrt(result);
  printf("Sqrt( 4.0*4.0 )= %d\r\n", Value1);
  timer_arm(interval_timer_ptr);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_START;  // --- timed region begins
  for (k = 0; k < 10000; k++) {
    // 10 x sqrt() in double precision (result is converted to/from float)
    result = sqrt(result); result = sqrt(result); result = sqrt(result);
    result = sqrt(result); result = sqrt(result); result = sqrt(result);
    result = sqrt(result); result = sqrt(result); result = sqrt(result);
    result = sqrt(result);
  }
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_STOP;   // --- timed region ends
  Value21 = timer_elapsed(interval_timer_ptr);       // Elapsed timer ticks
  printf("Clock cycles  = %d\n", Value21);
  printf("FP sqrt cycles: %d\n", (int) (Value21*Cscale));
  printf("FP sqrt   time: %d ns\n", (int) (Value21*Tscale));
  // Note: this output is required, otherwise the loop is optimized away
  printf("FP 100K sqrt result: %d\n\n", (int) result);

  //================ FP sqrtf test ===============
  j = 4.0; *(red_LED_ptr) = 32;
  result = sqrtf(j * j);
  Value1 = result;
  printf("Sqrtf 16  = %d\r\n", Value1);
  timer_arm(interval_timer_ptr);
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_START;  // --- timed region begins
  for (k = 0; k < 10000; k++) {
    // 10 x sqrtf() in single precision
    result = sqrtf(result); result = sqrtf(result); result = sqrtf(result);
    result = sqrtf(result); result = sqrtf(result); result = sqrtf(result);
    result = sqrtf(result); result = sqrtf(result); result = sqrtf(result);
    result = sqrtf(result);
  }
  interval_timer_ptr[TMR_CONTROL] = TMR_CTRL_STOP;   // --- timed region ends
  Value21 = timer_elapsed(interval_timer_ptr);       // Elapsed timer ticks
  printf("Clock cycles  = %d\n", Value21);
  printf("FP sqrtf cycles: %d\n", (int) (Value21*Cscale));
  printf("FP sqrtf   time: %d ns\n", (int) (Value21*Tscale));
  // Note: this output is required, otherwise the loop is optimized away
  printf("FP 100K SQRTF result: %d\r\n\r\n", (int) result);

  printf("\r\nSuccessfully ran FP SQRT measurements on Nios II\n");

  while (1) {  // Run Sqrt using the SW values; display on LEDs
    SW_value = *(SW_switch_ptr);  // Read the SW slider switch values
    d[0] = (double) SW_value;     // Initial double e.g. 25 or 100
    d_ptr = &d[0];                // Get starting address
    i_ptr = (int *) d_ptr;        // Point to same address
    i[0] = (unsigned int) *i_ptr;        // First 32 bits
    i[1] = (unsigned int) *(i_ptr + 1);  // Second 32 bits
    k = my_sqrt64(0, i[0]);  // Write to CIP LSBs
    k = my_sqrt64(1, i[1]);  // Write to CIP MSBs
    i[2] = my_sqrt64(2, 0);  // Read the SQRT64 LSBs
    i[3] = my_sqrt64(3, 0);  // Read the SQRT64 MSBs
    d_ptr = &d[1];           // Get starting address
    i_ptr = (int *) d_ptr;   // Point to same address
    *(i_ptr) = i[2];         // Place LSBs in double d[1]
    *(i_ptr + 1) = i[3];     // MSBs in double d[1]
    *(red_LED_ptr) = (int) d[1];  // Display at LEDs
  }
  return 0;  // Not reached: the loop above never exits
}
