// ==============================================================================
// Verilog Custom Instruction Template File for Internal Register Logic
// fp_sqrt64: custom-instruction wrapper around the external `fsqrt64` core.
//
// The 64-bit operand and the 64-bit result are moved through the 32-bit
// dataa/result ports in two halves, selected by the n field:
//   n == 8'h00 : write dataa into data[31:0]
//   n == 8'h01 : write dataa into data[63:32]
//   n == 8'h02 : read sqrt[31:0]  through result
//   n == 8'h03 : read sqrt[63:32] through result
//
// Handshake:
//   * For n <= 1 (writes), done simply mirrors start (no wait state).
//   * For n >  1 (reads),  done is asserted while the cycle counter cnt
//     equals CNT_DONE.
//
// Cycle counter (cnt):
//   * Restarted by a write (start && clk_en && n is 0 or 1).
//   * Counts up every clock and saturates at CNT_MAX.
//   * A read issued while cnt is saturated rewinds it so that it passes
//     through CNT_CAPTURE (result latched) and CNT_DONE (done asserted) again.
//
// NOTE(review): for n > 3 nothing rewinds cnt, so once cnt has saturated
// done can never be asserted for such an n - confirm the CPU never issues it.
// NOTE(review): a read whose start arrives while cnt is 32 or 33 reaches
// CNT_DONE without passing CNT_CAPTURE, so result may still hold the value
// latched earlier - confirm the software always waits long enough.
module fp_sqrt64(
  clk,    // CPU system clock (required for multi-cycle or extended multi-cycle)
  reset,  // CPU master asynchronous active high reset (required for multi-cycle or extended multi-cycle)
  clk_en, // Clock-qualifier (required for multi-cycle or extended multi-cycle)
  start,  // Active high signal used to specify that inputs are valid (required for multi-cycle or extended multi-cycle)
  done,   // Active high signal used to notify the CPU that result is valid (required for variable multi-cycle or extended variable multi-cycle)
  n,      // N-field selector (required for extended)
  dataa,  // Operand A (always required)
  datab,  // Operand B (optional; not used by this module)
  //======== Optional test ports (disabled):
  // output done_delay_out,
  // output [63:0] sqrt_out,
  // output [4:0] cnt_out,
  result); // Result (always required)

  input         clk;    // CPU system clock
  input         reset;  // asynchronous, active high
  input         clk_en; // clock qualifier
  input         start;  // inputs are valid
  output        done;   // result is valid
  input  [7:0]  n;      // N-field selector (see table in the module header)
  input  [31:0] dataa;  // Operand A: the 32-bit half being written
  input  [31:0] datab;  // Operand B: unused
  //======== Optional test ports (disabled):
  // output done_delay_out,
  // output [63:0] sqrt_out,
  // output [4:0] cnt_out,
  output [31:0] result; // selected 32-bit half of the square-root result

  // ---------------------------------------------------------------------
  // Named constants (previously inline magic numbers)
  // ---------------------------------------------------------------------
  localparam [7:0] N_WRITE_LO = 8'h00; // write data[31:0]
  localparam [7:0] N_WRITE_HI = 8'h01; // write data[63:32]
  localparam [7:0] N_READ_LO  = 8'h02; // read sqrt[31:0]
  localparam [7:0] N_READ_HI  = 8'h03; // read sqrt[63:32]

  localparam [6:0] CNT_REWIND  = 7'd29; // value a saturated counter is rewound to on a read
  localparam [6:0] CNT_CAPTURE = 7'd31; // cycle at which the selected result half is latched
  localparam [6:0] CNT_DONE    = 7'd33; // cycle at which done is asserted for reads
  localparam [6:0] CNT_MAX     = 7'd34; // saturation value of the counter

  // ---------------------------------------------------------------------
  // Internal state
  // ---------------------------------------------------------------------
  reg  [63:0] data;        // assembled 64-bit operand fed to the sqrt core
  wire [63:0] sqrt;        // output of the sqrt core
  reg  [31:0] r;           // latched 32-bit half driven onto result
  reg  [6:0]  cnt = 7'd0;  // cycle counter since the last write / rewind

  // Square-root core, defined outside this file.
  // The original note here reads "altfp_sqrt, pipeline => 30 (range: single
  // 16-28; double 30-57)" - presumably the core's latency; verify against
  // the IP configuration before changing any CNT_* constant.
  fsqrt64 fsqrt64_inst (
    .clock  ( clk  ),
    .data   ( data ),
    .result ( sqrt )
  );
  //  assign sqrt = data;  // bypass of the core (debug only)

  // ---------------------------------------------------------------------
  // Handshake
  // ---------------------------------------------------------------------
  wire write_sel = (n == N_WRITE_LO) || (n == N_WRITE_HI);
  wire read_sel  = (n == N_READ_LO)  || (n == N_READ_HI);

  wire done_delay = (cnt == CNT_DONE);
  // Reads (n > 1) wait for done_delay; writes complete in the start cycle.
  wire ready = (n > N_WRITE_HI) ? done_delay : start;
  assign done = ready;

  // ---------------------------------------------------------------------
  // Cycle counter
  //
  // Non-blocking assignments are used so that the other clocked blocks,
  // which sample cnt on the same edge, always see the pre-edge value in
  // simulation exactly as they do in the synthesized hardware.
  // The "+ 1" on the restart/rewind branches reproduces the original
  // behaviour, where the counter was loaded and then incremented within
  // the same clock cycle.
  // ---------------------------------------------------------------------
  always @ (posedge clk or posedge reset)
    if (reset) begin
      cnt <= 7'd0;
    end else if (start && clk_en && write_sel) begin
      // new operand half written: restart the count (0, then +1)
      cnt <= 7'd0 + 7'd1;
    end else if (start && clk_en && read_sel && (cnt == CNT_MAX)) begin
      // read after saturation: rewind (29, then +1) to replay capture/done
      cnt <= CNT_REWIND + 7'd1;
    end else if (cnt < CNT_MAX) begin
      // count until CNT_MAX is reached, then hold
      cnt <= cnt + 7'd1;
    end

  // ---------------------------------------------------------------------
  // Operand assembly: latch the selected 32-bit half of the input.
  // Note that start is not part of the write condition.
  // ---------------------------------------------------------------------
  always @ (posedge clk or posedge reset)
    if (reset) begin
      data <= 64'h0;
    end else if (clk_en) begin
      if (n == N_WRITE_LO)
        data[31:0]  <= dataa;
      else if (n == N_WRITE_HI)
        data[63:32] <= dataa;
    end

  // ---------------------------------------------------------------------
  // Result capture: latch the selected half of sqrt when cnt == CNT_CAPTURE.
  // ---------------------------------------------------------------------
  always @ (posedge clk or posedge reset)
    if (reset) begin
      r <= 32'h0;
    end else if (clk_en && (cnt == CNT_CAPTURE)) begin
      if (n == N_READ_LO)
        r <= sqrt[31:0];
      else if (n == N_READ_HI)
        r <= sqrt[63:32];
    end

  assign result = r; // connect to outputs

  //=== Test port assignments (disabled):
  //assign done_delay_out = done_delay;
  //assign sqrt_out = sqrt;
  //assign cnt_out = cnt;

endmodule
