/*
 *	File: lnreChi2.c
 *
 *      (C) IWTS
 *          KU Nijmegen
 *          The Netherlands
 *
 *      Author: R. Harald Baayen
 *		Stephen Tweedie
 *
 *      History:
 *
 *      - feb 1997, version 1.0 (rhb)
 *	- nov 1998, version 1.1 (rhb, st)
 *
 *      Description:
 *
 *      
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <malloc.h>
#include <math.h>
#include "lex_cons.h"


/* EXTERN FUNCTIONS */

/* functions for numerical procedures */

extern double	bincoef ();
extern double	factln ();
extern double	gammln ();
extern double	gammq ();
extern void	ludcmp ();
extern void	lubksb ();

extern int	*vectorI ();
extern double	*vector ();
extern double	**matrix ();
extern void	free_vector ();
extern void	free_matrix ();

/* argument reading, file manipulation, and help function */

extern int	leesgetal ();
extern void	change_extension ();
extern void	help ();


/* GLOBAL VARIABLES */

double   N, V, EV, N2, EV2,        /* number of tokens, types, at N, 2N   */
         SPECTRUM[MAXM2][3],       /* frequency spectrum m Vm EVm at N    */
         SPECTRUM2N[MAXM2][2],     /* frequency spectrum m EVm at 2N      */
         x, y, z, d, *COL,                
         chisquare,                /* X^2                                 */
         mse,                      /* mean squared error                  */
         relmse,                   /* relative mean squared error         */
         Vplus, EVplus,            /* V+, E[V]+                           */
         p, q,                     /* prob and (1-prob)                   */
         *ERROR,                   /* dynamic error vector                */
         *VECTOR,                  /* dynamic vector for quadratic form   */
         **COV,                    /* dynamic array for covariance matrix */
         **ICOV;                   /* dynamic array for inverse of COV    */

FILE     *fpexpspect,              /* fpspectrum: datafile: m, Vm, EVm  at N */
         *fpexpspect2N,            /* spectrum (m, Vm) at 2N               */
         *fpVN,                    /* file with E[V(N)] and E[V(2N)]       */
         *fpchi2,                  /* file with Chi2 output                */
         *fpcov;                   /* file with covariance matrix          */

int      nranks,                   /* number of different ranks in spectrum */
         nranks2,                  /* number of different ranks in spectrum 2N */
         maxrank,                  /* largest rank for fit, default 15 */
         i, j,                     /* counters */
         df,                       /* degrees of freedom */
         *indx,                    /* for matrix inversion */
         header,                   /* boolean for presence of header */
         token, type,              /* auxiliary variables for scanf */
         aantal,                   /* for command line options */
         adjusted,                 /* adjusted Zipf file names with code .aZ. */
         ad2Sgam,                  /* boolean op ad2Sgam fit */
         nparam;                   /* number of parameters of the model */

char     woord[MAXWORDLENGTH],     /* variable for skipping header in fscanf */
         new_name[MAXWORDLENGTH],  /* variables for extension handling    */
         base_name[MAXWORDLENGTH],
         *fs,                      /* variables for scanning options */
         tag;                      /* model tag for file extensions */


int main (argc, argv)
int     argc;
char    *argv[];

{
   /* DEFAULT */

   tag = 'S';
   nparam = 2;
   adjusted = 0;
   ad2Sgam = 0;

   /* READ COMMAND LINE OPTIONS */

   while ((--argc > 0) && ((*++argv)[0] == '-')) {
        for (fs = argv[0] + 1; *fs != '\0'; fs++) {
            switch (*fs) {
            case 'h':
                help();
                break;
            case 'm':
                maxrank = leesgetal (fs, &aantal);
                for (; aantal > 0; aantal--){
                   fs++;
                }
                break;
            default:
                fprintf(stderr, "lnreChi2: illegal option %c\n", *fs);
                exit(1);
                break;
            }
        }
   } /* of while */

   if (argc == 0) {
     help ();
   }

   /* FILE HANDLING */


   if ((fpexpspect = fopen(*argv, "r")) == NULL) {
       fprintf(stderr, "lnreChi2: can't open %s\n", *argv);
       exit(1);
   }

   /* construct base name tag */

   /* strip .spc */

   strncpy(base_name, *argv, strlen(*argv) - 4);  

   j = 0;
   for (i = 0; (i<= MAXWORDLENGTH) && (base_name[i] != '\0'); i++){
        if (base_name[i] == '_'){
             j = i;            
             break;
        }
   }
   
   switch(base_name[j+1]){
       case 'a': if (base_name[j+2] == 'S'){
                      nparam = 3;
                 }
                 else{  /* Zipf */
                      nparam = 2;
                 }
                 break;
       case 'b': if (base_name[j+2] == 'Z'){
                      nparam = 2;
                 }
                 else{  
                    if (base_name[j+2] == 'G'){
                      nparam = 4;
                      ad2Sgam = 1;
                    }
                    else{
                       if (base_name[j+2] == 'Y'){
                           nparam =3;
                       }
                       else{  /* Sichel with g=-0.5 */
                           nparam = 3;
                       }
                    }
                 }
                 break;
       case 'Z': nparam = 1;
                 break;
       case 'S': nparam = 2;
                 break;
       case 'C': if (base_name[j+2] == 'C') {
                             nparam = 5;
                 } else {
                   if (base_name[j+2] == 'Y') {
                             nparam = 6;
                   } else {
                      if (base_name[j+2] == 'G') {
                             nparam = 6;
                      } else {
                             nparam = 2;
                      }
                   }
                 }
                 break;
       case 'Y': if (base_name[j+2] == 'C') {
                             nparam = 6;
                 } else {
                   if (base_name[j+2] == 'Y') {
                             nparam = 7;
                   } else {
                      if (base_name[j+2] == 'G') {
                             nparam = 7;
                      } else {
                             nparam = 3;
                      }
                   }
                 }
                 break;
       case 'G': if (base_name[j+2] == 'C') {
                             nparam = 6;
                 } else {
                   if (base_name[j+2] == 'Y') {
                             nparam = 7;
                   } else {
                      if (base_name[j+2] == 'G') {
                             nparam = 7;
                      } else {
                             nparam = 3;
                      }
                   }
                 }
                 break;
       default:  fprintf(stderr, "lnreChi2: unknown model type\n");
                 exit(1);
                 break;
   }

   change_extension (base_name, new_name, ".sp2");
   if ((fpexpspect2N = fopen(new_name, "r")) == NULL){
      fprintf(stderr, "lnreChi2: can't open input file %s\n", new_name);
      exit(1);
   }
   change_extension (base_name, new_name, ".ev2");
   if ((fpVN = fopen(new_name, "r")) == NULL){
      fprintf(stderr, "lnreChi2: can't open output file %s\n", new_name);
      exit(1);
   }
   change_extension (base_name, new_name, ".chi");
   if ((fpchi2 = fopen(new_name, "w")) == NULL){
      fprintf(stderr, "lnreChi2: can't open output file %s\n", new_name);
      exit(1);
   }
   change_extension (base_name, new_name, ".cov");
   if ((fpcov = fopen(new_name, "w")) == NULL){
      fprintf(stderr, "lnreChi2: can't open output file %s\n", new_name);
      exit(1);
   }

   /* LOADING THE DATA */

   /* LOAD SPECTRUM AT N */

   /* skip header */

   fscanf(fpexpspect, "%s ", woord);  /* m */
   fscanf(fpexpspect, "%s ", woord);  /* Vm */
   fscanf(fpexpspect, "%s ", woord);  /* EVm */
   fscanf(fpexpspect, "%s ", woord);  /* alphaM of EVmBase*/
   fscanf(fpexpspect, "%s ", woord);  /* EalphaM of EVmComplement */

   nranks = 0; 
   while (fscanf(fpexpspect, "%d %d %lf %lf %lf", 
          &token, &type, &x, &y, &z) != EOF)  {
        nranks++;
        if (nranks > (MAXM2-1)){
            fprintf(stderr, "lnreChi2: ERROR: too many ranks in \n"); 
            exit(1);
        }
        SPECTRUM[nranks][0] = (double) token;
        SPECTRUM[nranks][1] = (double) type;
        SPECTRUM[nranks][2] = x;
   }
   fclose(fpexpspect);

   /* LOAD SPECTRUM AT 2N */

   /* skip header */

   fscanf(fpexpspect2N, "%s ", woord);  /* m */
   fscanf(fpexpspect2N, "%s ", woord);  /* EVm2N */

   nranks2 = 0;
   while (fscanf(fpexpspect2N, "%lf %lf", &x, &y) != EOF)  {
        nranks2++;
        if (nranks2 > (MAXM2 - 1)){
            fprintf(stderr, "lnreChi2: ERROR: too many ranks\n"); 
            exit(1);
        }
        SPECTRUM2N[nranks2][0] = x;
        SPECTRUM2N[nranks2][1] = y;
   }
   fclose(fpexpspect2N);

   fscanf(fpVN, "%s ", woord);  /* V */
   fscanf(fpVN, "%s ", woord);  /* EV */
   fscanf(fpVN, "%s ", woord);  /* EV2N */
   if (ad2Sgam == 1) fscanf(fpVN, "%s ", woord);  /* EV2Norig */
   fscanf(fpVN, "%lf %lf %lf", &V, &EV, &EV2);
   fclose(fpVN);

   /* CONSTRUCT ERROR VECTOR AND CALCULATE mse ON THE WAY */

   if ((maxrank != 0) && (maxrank <= nranks)) nranks = maxrank;  
   mse = NULL_F; relmse = NULL_F;
   ERROR = vector (1, nranks + 1);
   ERROR[1] = V - EV;
   mse += (ERROR[1] * ERROR[1]);
   relmse += ((ERROR[1] * ERROR[1])/(V*V));
   for (i = 1; i <= nranks; i++){
       ERROR[i+1] = SPECTRUM[i][1] - SPECTRUM[i][2];
       mse += (ERROR[i] * ERROR[i]);
       relmse += ((ERROR[i] * ERROR[i])/(V*V));
   }

   Vplus = NULL_F; EVplus = NULL_F;
   for (i = 1; i <= nranks; i++){
      Vplus += SPECTRUM[i][1];
      EVplus += SPECTRUM[i][2];
   }
   Vplus = V - Vplus;
   EVplus = EV - EVplus;
   mse +=  (Vplus - EVplus) * (Vplus - EVplus);
   relmse +=  (((Vplus - EVplus) * (Vplus - EVplus))/(V*V));
   

   fprintf(fpchi2, "Goodness-of-fit evaluation for %s\n\n", *argv);
   fprintf(fpchi2,"Mean squared error (MSE): %.4f\n", mse/( (double) nranks+2));
   fprintf(fpchi2,"Relative Mean squared error (rMSE): %10.8f\n", 
                   relmse/( (double) nranks+2));
   fprintf(fpchi2, "The error vector is:\n");
   fprintf(fpchi2, "%10s  %10.3f\n", "V", ERROR[1]);
   for (i = 2; i <= nranks+1; i++){
      fprintf(fpchi2, "%10d  %10.3f\n", i-1, ERROR[i]);
   }
   fprintf(fpchi2, "V+ = %.2f EV+ = %.2f\n\n", Vplus, EVplus);

   /* CONSTRUCT COVARIANCE MATRIX */

   /* FIRST ALLOCATE REQUIRED MEMORY */

   COV = matrix (1, nranks + 1, 1, nranks + 1);
   ICOV = matrix (1, nranks + 1, 1, nranks + 1);

   /* THEN ADD VALUES */

   /*
   m = 0, k = 1,..,maxrank:  COV[V(N), V(k,N)]
   k = 0, m = 1,...,maxrank: idem
   */

   COV[1][1] = EV2 - EV;

   for (i = 1; i <= nranks; i++) {
       COV[1][i+1] = (1.0 / pow(2.0, (double) i) ) * SPECTRUM2N[i][1];
       COV[i+1][1] = COV[1][i+1];
   }

   /* covariances of V(m,N) and V(k,N) */

   for (i = 1; i <= nranks; i++){
     for (j = 1; j <= nranks; j++){
        if (i == j){
            COV[i+1][j+1] = SPECTRUM[i][2] -
                     ( bincoef(i+j,i)
                     * ( 1.0 /  pow(2.0, (double) (i+j) )     ) 
                     * SPECTRUM2N[i+j][1] );
        }
        else{
            COV[i+1][j+1] = - ( bincoef(i+j,i)
                     * ( 1.0 /  pow(2.0, (double) (i+j) )     ) 
                     * SPECTRUM2N[i+j][1] );
        }
     }
   }

   /* PRINT COV */

   for (i = 1; i <= nranks+1; i++){
      for (j = 1; j <= nranks+1; j++){
         fprintf(fpcov, "%8.2f ", COV[i][j]);
      }
      fprintf(fpcov, "\n");
   }
   fclose(fpcov);

   /* INVERT COV, NUMERICAL RECIPES IN C, PAGE 45 */

   COL = vector (1, nranks+1);
   indx = vectorI (1, nranks+1);
   ludcmp(COV, nranks+1, indx, &d);
   for (j = 1; j <= nranks+1; j++){
      for (i = 1; i <= nranks+1; i++){
          COL[i] = NULL_F;
      }
      COL[j] = 1.0;
      lubksb(COV, nranks+1,indx,COL);
      for (i = 1; i <= nranks+1; i++) ICOV[i][j] = COL[i];
   }

   /* AND CONTINUE WITH \chi^2 TEST:  COMPUTE THE QUADRATIC FORM */
  
   VECTOR = vector (1, nranks + 1);

   for (i = 1; i <= nranks+1; i++) {
       for (j = 1; j <= nranks+1; j++) {
           VECTOR[i] += ICOV[i][j] * ERROR[j];
       }
   }

   chisquare = NULL_F;
   for (i = 1; i <= nranks+1; i++){
         chisquare += (VECTOR[i] * ERROR[i]);
   }

   df = nranks + 1 - nparam;
 
   if (chisquare > NULL_F){
       q = gammq (0.5 * (double) df, 0.5 * chisquare);
   }

   fprintf(fpchi2, "\nX2(%d) = %10.2f\n", df, chisquare);
   fprintf(fpchi2, "p = %20.18f\n", q);
   fprintf(stderr, "\nX2(%d) = %10.2f, p = %15.12f, MSE = %f, rMSE = %f\n",   
           df, chisquare, q, mse/((double) nranks+2), 
           relmse/((double) nranks+2));

   /* CLEAN UP MEMORY */

   free_vector (VECTOR, 1, nranks+1);
   free_vector (COL, 1, nranks+1);
   free_vector (ERROR, 1, nranks+1);
   free_matrix (COV, 1, nranks+1, 1, nranks+1);
   free_matrix (ICOV, 1, nranks+1, 1, nranks+1);

   return (0);
}


void help ()
{
  fprintf (stderr,"lnreChi2 text_[SCZ|aZ].spc\n");
  fprintf (stderr,"OPTIONS:\n");
  fprintf (stderr,"     -h: display help\n");
  fprintf (stderr,"INPUT:\n");
  fprintf (stderr,"     text_[SCZ|aZ].spc      (m Vm EVm at N)\n");
  fprintf (stderr,"Other files that should be available:\n");
  fprintf (stderr,"     text_[SCZ|aZ].sp2      (m Vm EVm at 2N)\n");
  fprintf (stderr,"     text_[SCZ|aZ].ev2      (V(N), E[V(N)] and E[V(2N)])\n");
  fprintf (stderr,"OUTPUT:\n");
  fprintf (stderr,"     text_[SCZ|aZ].chi      goodness of fit statistics\n");
  fprintf (stderr,"     text_[SCZ|aZ].cov      the covariance matrix\n");
  exit (1);
}
