/*
 *	File: mcdisp.c
 *
 *      (C) IWTS
 *          KU Nijmegen
 *          The Netherlands
 *
 *      Author: R. Harald Baayen
 *
 *      History:
 *
 *      - feb 1997, version 1.0
 *
 *      Description:
 *
 *	mcdisp -kK -pP text.zvc text.spc
 *	  OPTIONS:
 *	    -k: K chunks   (default: 40)
 *	    -p: P permutations (default: 0)
 *	    -H: input files without headers
 *	  INPUT:
 *	     text.zvec:   Word z
 *	     text.spect:  m Vm
 *	  OUTPUT:
 *	     text.mcdisp
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "lex_cons.h"


/* EXTERN FUNCTIONS */

/* functions for numerical procedures */

extern void	randomize ();
extern double	uniformRandom ();
extern double	expdisp ();
extern double	vardisp ();

/* argument reading, file manipulation, and help function */

extern int	leesgetal ();
extern void	change_extension ();
extern void	help ();


/* GLOBAL VARIABLES */

double SPECTRUM[MAXM4][MAXM_VM],            /* frequency spectrum */
       proportion,                          /* proportion of extreme dispersions */
       Z;                                   /* Z-score */

FILE   *fpzvec,                             /* text.zvec */
       *fpspect,                            /* text.spect */
       *fpdisp,                             /* text.mcdisp */
       *fpfik;                              /* text.fik */

int    FREQBYCHUNK[MAXTYPES1][MAXCHUNKS4],  /* frequency by word for chunks */
       TEXT[MAXTOKENS1][2],                 /* text (in Zipf ranks), and chunk parentage */
       DISP[MAXTYPES1][MAXCHUNKS4],         /* for each type, count of dispersions d */
       OBSDISP[MAXTYPES1],                  /* empirical dispersion of types 1..V */
       OBSFREQ[MAXTYPES1],                  /* empirical frequencies of types 1..V */
       EXTREMEDISP[MAXTYPES1],              /* count of runs with dispersion < obsdisp */
       MAP[MAXM4],
       CHUNKS[MAXCHUNKS4],                  /* the K chunksizes */

       aantal,                              /* used by leesgetal() */
       getal,                               /* variable for scanf() */
       i, j, k, l,                          /* counters */
       p,                                   /* variable for permutation run */

       lines,                               /* number of lines in text.zvec */
       d,                                   /* dispersion */
       f,                                   /* frequency */
       n_ranks,                             /* number of ranks in spectrum */
       header,                              /* boolean on presence headers in input */

       /* CHUNKS */

       chunk,                               /* variables for determining the chunks */
       nchunks,                             /* number of chunks */
       chunksize,
       remainDer,

       /* TYPES, TOKENS */

       V, N, 
       token, type, 

       /* PERMUTATION VARIABLES */

       nperm,                               /* number of permutations */
       seed;                                /* seed for random generator */

char   WORDS[MAXTYPES1][MAXWORDLENGTH],     /* the list of word strings */
       woord[MAXWORDLENGTH],                /* variable for a word */
       new_name[MAXWORDLENGTH],             /* variables for extension handling */
       base_name[MAXWORDLENGTH],
       *fs;                                 /* char for scanning command line */



/* MAIN () */

int main (argc, argv)
int	argc;
char	*argv[];

{
   /* DEFAULTS */

   nperm = 5000;
   nchunks = 40;
   seed = 0;
   header = 1;

   /* HANDLING OF OPTIONS */

   while ((--argc > 0) && ((*++argv)[0] == '-')) {
        for (fs = argv[0] + 1; *fs != '\0'; fs++) {
            switch (*fs) {
            case 'h':
                help();
                break;
            case 'p':
                nperm = leesgetal (fs, &aantal);
                for (; aantal > 0; aantal--){
                   fs++;
                }
                break;
            case 'k':
                nchunks = leesgetal (fs, &aantal);
                for (; aantal > 0; aantal--){
                   fs++;
                }
                break;
            case 'H':      /* input files without headers! */
                header = 0;
                break;
            default:
                fprintf(stderr, "mcdisp: illegal option %c\n", *fs);
                exit(1);
                break;
            }
        }
   } /* of while */

   /* FILE HANDLING, FIRST THE TWO INPUT FILES */

   if (argc == 0) {
     help ();
   }

   if ((fpzvec = fopen(*argv, "r")) == NULL) {
      fprintf(stderr, "mcdisp: can't open input file %s\n", *argv);
      exit(1);
   }

   argv++; argc--;
   if ((fpspect = fopen(*argv, "r")) == NULL) {
      fprintf(stderr, "mcdisp: can't open spectrum file %s\n", *argv);
      exit(1);
   }

   /* NEXT THE OUTPUT FILES */

   strncpy(base_name, *argv, strlen(*argv) - 3);
   change_extension (base_name, new_name, "mcd");
   if ((fpdisp = fopen(new_name, "w")) == NULL){
      fprintf(stderr, "mcprofile: can't open output file %s\n", new_name);
      exit(1);
   }
   change_extension (base_name, new_name, "fik");
   if ((fpfik = fopen(new_name, "w")) == NULL){
      fprintf(stderr, "mcprofile: can't open output file %s\n", new_name);
      exit(1);
   }

   /* ================================================================= */
   /*                                                                   */
   /* LOAD THE SPECTRUM FILE                                            */
   /*                                                                   */
   /* ================================================================= */

   if (header){
       fscanf(fpspect, "%s", woord);   /* m */
       fscanf(fpspect, "%s", woord);   /* Vm */
   }
   i = 0;
   while (fscanf(fpspect, "%d %d", &token, &type) != EOF)  {
        SPECTRUM[i][0] = (double) token;
        SPECTRUM[i][1] = (double) type;
        i++;
   }
   n_ranks = i;

   /* CONSTRUCT AN ARRAY FOR GOING FROM A FREQUENCY TO THE INDEX IN SPECTRUM */

   for (i = 0; i < n_ranks; i++) MAP[(int)SPECTRUM[i][0]] = i;       

   /* AND CALCULATE NUMBER OF TYPES V AND NUMBER OF TOKENS V */

   N = 0; V = 0;
   for (i = 0; i < n_ranks; i++) {
       N += (int) (SPECTRUM[i][0] * SPECTRUM[i][1]);
       V += (int) SPECTRUM[i][1];
   }

   /* check text size (N, V) with respect to array bound */

   if (N > MAXTOKENS1){   
         fprintf(stderr, "mcdisp: ERROR: N = %d exceeds MAXTOKENS\n", N);
         exit(1);
   }
   if (V > MAXTYPES1){   
         fprintf(stderr, "mcdisp: ERROR: V = %d exceeds MAXTYPES\n", V);
         exit(1);
   }

   fprintf(stderr, "mcdisp: loaded frequency spectrum, N = %d\n", N);

   /* NEXT DETERMINE THE CHUNKSIZES  */

   chunksize = (int) floor(N/(nchunks*1.0));
   remainDer = N - (nchunks * chunksize);
   for (k = 1; k <= nchunks; k++) CHUNKS[k] = chunksize;
   for (k = 1; k <= remainDer; k++) CHUNKS[k]++;
   for (k = 2; k <= nchunks; k++) CHUNKS[k] += CHUNKS[k-1];
   CHUNKS[0] = 0;

   /* CALCULATE THE EXPECTATIONS AND STDEV'S USING BINOMIAL MODEL       */
   /* FOR THE WORDS WITH FREQUENCIES m = 1, ..., mmax                   */

   /* trivial for the hapax legomena */

   SPECTRUM[0][2] = 1.0; SPECTRUM[0][3] = NULL_F;

   /* and a bit more work for higher frequencies: */

   for (i = 1; i < n_ranks; i++){
        /* theoretical expressions are based on nchunks equal-sized chunks */
        SPECTRUM[i][2] = expdisp((double) nchunks, SPECTRUM[i][0]);
        SPECTRUM[i][3] = sqrt(vardisp((double) nchunks, SPECTRUM[i][0]));
   }
   
   /* OBSERVED DISPERSIONS */

   /* FIRST READ THE TEXT, USING THE ZIPF RANKS AS WORD IDENTIFIERS */

   /* TEXT is indexed from 1..N */

   if (header){
       fscanf(fpzvec, "%s", woord);   /* Word */
       fscanf(fpzvec, "%s", woord);   /* z */
   }
   lines = 0;
   while (fscanf(fpzvec, "%s %d ", woord, &getal) != EOF){
     lines++;
     TEXT[lines][0] = getal;
     strcpy(WORDS[getal], woord);
   }
   fclose(fpzvec);
   fprintf(stderr, "mcdisp: loaded text, N = %d\n", N);

   if (lines != N){
       fprintf(stderr, 
          "mcdisp: ERROR: mismatch between text.zvec and text.spectrum\n");
       exit(1);
   }
   
   /* add chunk number specification to each text position */

   for (k = 1; k <= nchunks; k++){
       for (i = CHUNKS[k-1]; i <= CHUNKS[k]; i++){
            TEXT[i][1] = k;
       }
   }

   /* CALCULATE THE EMPIRICAL DISPERSIONS */

   for (i = 1; i <= lines; i++) FREQBYCHUNK[TEXT[i][0]][TEXT[i][1]]++;

   for (i = 1; i <= V; i++){    /* the Zipf ranks 1..V */
       d = 0;      /* dispersion of word with zipf rank i */
       f = 0;      /* frequency of word with zipf rank i */
       for (j = 1; j <= nchunks; j++){
           if (FREQBYCHUNK[i][j] > 0) d++;
           f += FREQBYCHUNK[i][j];
       }
       OBSDISP[i] = d;
       OBSFREQ[i] = f;
   }

   /* AND PRINT THEM */

   fprintf(fpfik, "Word                        z    ");
   for (i = 1; i <= nchunks; i++) fprintf(fpfik, " k%d", i);
   fprintf(fpfik, "\n");

   for (i = 1; i <= V; i++){
      fprintf(fpfik, "%-20s %8d    ", WORDS[i], i);
      for (j = 1; j <= nchunks; j++){
            fprintf(fpfik, " %d", FREQBYCHUNK[i][j]);  
      }
      fprintf(fpfik, "\n");
   }
   fclose(fpfik);
   fprintf(stderr, "mcdisp: f(i,k) table completed\n");

   /* FINALLY, SET FREQBYCHUNK TO ZERO FOR PERMUTATION RUNS */

   for (i = 1; i <= V; i++){
      for (j = 1; j <= nchunks; j++){
           FREQBYCHUNK[i][j] = 0;  
      }
   }

   /* NEXT THE PERMUTATION RUNS */

   if (seed == 0) {
	   	seed = -1;
		uniformRandom(&seed);
   }

   for (p = 1; p <= nperm; p++){
          if (p % 10 == 0) fprintf(stderr, "%d\r", p);
          randomize(p, lines);   
		  /* different seeds for different runs no longer maintained, not
		   * necessary for this random generator */

          for (i = 1; i <= lines; i++) FREQBYCHUNK[TEXT[i][0]][TEXT[i][1]]++;
     
          for (i = 1; i <= V; i++){    
              d = 0;              /* dispersion of word with Zipf rank i */
              for (j = 1; j <= nchunks; j++){
                   if (FREQBYCHUNK[i][j] > 0) d++;
                   FREQBYCHUNK[i][j] = 0; /* initialize for next run */
              }
              DISP[i][d]++;   
          }
   }
   fprintf(stderr, "\nmcdisp: completed permutation runs\n");

   /* COUNT NUMBER OF RUNS FOR WHICH MONTE CARLO DISPERSION < OBSERVED DISP */

   for (i = 1; i <= V; i++) {   /* for each type */
      j = 1;
      while (j <= OBSDISP[i]){
          EXTREMEDISP[i]+=DISP[i][j];
          j++;
      }
   }

   /* SAVE RESULTS IN text.mcdisp */
   
   fprintf(fpdisp, "Word                        z   Frequency Obs Exp    StDev  Z        MCperc\n");
   for (i = 1; i <= V; i++) {      /* for all types */

       /* Z-score given Binomial model  */

       if (SPECTRUM[ MAP[OBSFREQ[i]] ][3] > NULL_F){
          Z = (OBSDISP[i] - SPECTRUM[ MAP[OBSFREQ[i]] ][2]) /
             sqrt(SPECTRUM[ MAP[OBSFREQ[i]] ][3]);
       }
       else{
          Z = NULL_F;
       }

       if (nperm == 0){
            proportion = NULL_F;
       }
       else{
            proportion = EXTREMEDISP[i]/(nperm*1.0);
       }
       fprintf(fpdisp, "%-20s %8d %8d %4d %6.2f %6.2f %6.2f %10.4f\n", 
         WORDS[i], i, OBSFREQ[i], OBSDISP[i], 
         SPECTRUM[MAP[ OBSFREQ[i] ]][2],   /* expected dispersion */
         SPECTRUM[MAP[ OBSFREQ[i] ]][3],   /* stdev of dispersion */
         Z,                                /* Z-score */
         proportion                        /* proportion more extreme disp */
       );
   }
   fprintf(stderr, "mcdisp: completed mcdisp table\n");

   return (0);
}


void randomize (s, n)
int s, n;

{
  int k, i, regel, tmp, r;

  k = n; r = 1;

  for (i = 1; i < n; i++) {
     regel = ((i-1) + ((int) floor( (k-i+1) * uniformRandom(&r) ) + 1));
     tmp = TEXT[regel][0];
     TEXT[regel][0] = TEXT[i][0];
     TEXT[i][0] = tmp;
  }
}


double expdisp (k, n)
double k, n;

{
  return (k * (1.0 - exp (n * log (1.0 - (1.0 / k)))));
}


double vardisp (k, n)
double k, n;

{
  return ((k * (k - 1.0) * exp (n * log (1.0 - (2.0 / k)))) +
          (k * exp (n * log (1.0 - (1.0 / k)))) -
          (k * k * exp (2.0 * n * log (1.0 - (1.0 / k)))));
}


void help()
{
  fprintf(stderr,"mcdisp -kK -pP -H text.zvc text.spc\n");
  fprintf(stderr,"      (k maximally %d, maximal text length %d)\n", 
                 MAXCHUNKS4-1, MAXTOKENS1-1);
  fprintf(stderr,"OPTIONS:\n"); 
  fprintf(stderr,"      -k: K chunks (default: 40)\n");
  fprintf(stderr,"      -p: P permutation runs (default: 0)\n");
  fprintf(stderr,"      -H: input files lack header (default: with header)\n");
  fprintf(stderr,"INPUT:\n");
  fprintf(stderr,"      text.zvc:   Word z\n");
  fprintf(stderr,"      text.spc:  m Vm\n");
  fprintf(stderr,"OUTPUT:\n"); 
  fprintf(stderr,"      text.mcd: observed and Monte Carlo dispersions\n");
  fprintf(stderr,"      text.fik: empirical chunk frequencies f(i,k)\n");

  exit(1);
}


