/*
 *	File: wfl2spc.c
 *
 *      (C) IWTS
 *          KU Nijmegen
 *          The Netherlands
 *
 *      Author: R. Harald Baayen
 *
 *      History:
 *
 *      - jul 1997, version 1.0
 *      - feb 2000, version 1.1
 *
 *      Description: converts word frequency list into spectrum and
 *      calculates various lexical `constants'
 *      Counts number of zero types and reports this on stdout, but
 *      does not include the zero ranks in the spectrum unless
 *      specifically requested by using the z option
 *
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "lex_cons.h"


/* EXTERN FUNCTIONS */

/* functions for numerical procedures */

extern double	getD ();
extern double	getE ();
extern double	getK ();
extern double	getZ ();
extern double	getlogMean ();
extern double	getlogStdDev ();
extern void	lognormal ();

extern double	fz ();
extern double	delta ();

extern void	SichelParam ();

extern void	showsummary ();      /* print summary file                  */
extern void	showwflist ();       /* print wflist file                   */
extern void	showspectrum ();     /* print spectrum file                 */
extern void	updatespectrum ();   /* update frequency spectrum           */
extern void	insertintoMLIJST (); /* insert new word freq into MLIJST    */
extern void	insert ();           /* insertion itself for previous       */
extern void	insertM ();          /* insertion itself for previous       */
extern void	constants ();        /* calculates constants from spectrum  */
extern int	find ();             /* find word type in WORDTYPES         */

/* functions for cleaning up the input, removing sgml codes, ... */

extern void	insertintotypelist ();
extern void	showtypelist ();

/* argument reading, file manipulation, and help function */

extern void	change_extension ();
extern void	help ();


/* GLOBAL VARIABLES */

double
        /* GLOBAL VARIABLES FOR PARAMETERS OF SICHEL'S MODEL */

        cur_c, cur_b, cur_a,

        /* PARAMETERS OF SICHEL'S MODEL, ESTIMATED FROM THE SAMPLE */

        smean, sstdev,
        PVM[MAXTYPES4],

        /* DATA STRUCTURE FOR OBSERVED PROFILE */

        tabel_orig[NMEASURES+1][MAXCHUNKS1+1];

FILE
        /* INPUT FILE */

        *fptext,                   /* input text */

        /* OUTPUT FILES */

        *fpwflist,                 /* word frequency list */
        *fpspectrum,               /* frequency spectrum */
        *fpsum;                    /* file with summary statistics */

int     i, j, k, l, aantal,        /* counters */
        m, Vm,                     /* tokens and types in wflist input file */

        /* BASIC DATA STRUCTURES */

        CHUNKS[MAXCHUNKS1+1],      /* the K chunksizes */
        TYPENLIJST[MAXTYPES4+1],   /* list of types */
        VM[MAXTYPES4+1],           /* frequency spectrum: V(m) */
        MLIJST[MAXTYPES4+1],       /* list of frequency ranks m */
        WORDFREQ[MAXTYPES4+1],     /* frequency counts for types */
        WORDINDEX[MAXTYPES4+1],    /* indices of frequencies */
        SORTFREQ[MAXTYPES4+1],     /* sorted frequencies, high to low */
        ZRANK[MAXTYPES4+1],        /* Zipf ranks */

        /* CHUNKS */

        chunk,                     /* variables for determining the chunks */
        nchunks,
        chunksize,
        remainDer,

        /* LEXICAL PARAMETERS */

        V, N,                      /* types, tokens */
        curN, curV,                /* types, tokens up to current chunk size */
        Nzero, Vzero,              /* empirical number of tokens, types */
        curV1, curV2,              /* hapaxes, disleg. up to current chunk */
        curV3, curV4, curV5,
        mMax, gmMax,               /* maximal (current) frequency rank */
        freq_a, freq_the,          /* current frequencies of these words */
        header,                    /* == 0 if input wfl does not have header */
        maantal,                   /* number of frequency ranks */
		includeZero,               /* boolean for including zero ranks */
		V0,                        /* number of types with frequency 0 */

        /* file and command line handling */

        verbose,                   /* boolean for print statements */
        c, lastchar,               /* character variabeles */
        ignore,                    /* boolean for dropping SGML mark-up */
        inheader,                  /* true if in header */
        divtagcount,               /* count for stack of incomplete tags */
        noextension,               /* boolean for presence of ".txt" extension */
        allfiles,                  /* boolean for printing text vectors */
        vecexists,                 /* text in vector format exists already */
        frequency,                 /* for scanning frequencies in input*/
  
        /* ZIPF, SICHEL */

        zipfzeros, sichelzeros;    /* counts for fitless spectra */

char    *s,                        /* scanning command line */

        /* variables for scanning and handling file input */

        beginheader[MAXWORDLENGTH],
        endheader[MAXWORDLENGTH],
        newword[MAXWORDLENGTH],
        testword[MAXWORDLENGTH],
        sdummy[MAXWORDLENGTH],
        word[MAXWORDLENGTH],

        /* variables for handling file name extensions */
 
        woord[MAXWORDLENGTH],      /* variable for a word */
        new_name[MAXWORDLENGTH],   /* variables for extension handling */
        base_name[MAXWORDLENGTH],
        fpvecname[MAXWORDLENGTH],

        /* THE BASIC LIST OF WORD TYPES */

        WORDTYPES[MAXTYPES4+1][MAXWORDLENGTH];


/* MAIN () */

int main (argc, argv)           
int     argc;
char   *argv[];

{
   /* DEFAULT SETTINGS */

   nchunks = 1;                    /* default number of chunks */
   zipfzeros = 0;                  /* count of missing fits Zipf */
   sichelzeros = 0;                /* count of missing fits Sichel */
   verbose = 0;                    /* default: no verbose mode */
   header= 1;                      /* default: file has a header */
   includeZero = 0;                /* default: don't include zero ranks */
   V0 = 0;

   while (--argc > 0 && (*++argv)[0] == '-') {
        for (s = argv[0] + 1; *s != '\0'; s++) {
            switch (*s) {
                   case 'h':   /* help */
                        help();
                        exit(1);
                        break;
                   case 'v':   /* extra print statements */
                        verbose = 1;
                        break;
                   case 'e':   /* file name does not have extension ".wfl" */
                        noextension = 1;
                        break;
                   case 'm':   /* file does not have a header (m Vm) */
                        header = 0;
                        break;
				   case 'z':
						includeZero = 1;
						break;
		           default: 
		                fprintf (stderr, "wfl2spc: illegal option %c\n", *s);
		                exit (1);
		                break;
	        } /* of switch */
	    }  /* of for */
   }  /* of while */

   /* FILE HANDLING */
  
   if (argc == 0) help();

   /* THE INPUT */

   if ((fpwflist = fopen (*argv, "r")) == NULL) {
        fprintf(stderr, "wfl2spc: can't open %s for reading\n", *argv);
        exit (1);
   }

   /* OUTPUT FILES */

   /* base_name is file name ".wfl" is stripped when present */
      
   if (noextension == 0){
       strncpy(base_name, *argv, strlen(*argv) - 4);
   }
   else{
       strcpy(base_name, *argv);
   }

   change_extension (base_name, new_name, ".spc");
   if ((fpspectrum = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "wfl2spc: can't open textfile %s\n", new_name);
    exit(1);
   }

   change_extension (base_name, new_name, ".sum");
   if ((fpsum = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "wfl2spc: can't open textfile %s\n", new_name);
    exit(1);
   }

   /* READ THROUGH INPUT FILE ONCE TO DETERMINE TEXT SIZE */

   /* read through data.wfl and build spectrum */

   if (header){
      fscanf(fpwflist, "%s", sdummy);  /* Word  */
      fscanf(fpwflist, "%s", sdummy);  /* Frequency */
   }

   V = 1;
   while (fscanf(fpwflist, "%s %d", word, &frequency) != EOF){
	   	if (frequency > 0) {
       		strcpy(WORDTYPES[V], word);
       		WORDFREQ[V] = frequency;
       		N+=WORDFREQ[V];
       		V++;
		} else {
			if (includeZero==1) {
       			strcpy(WORDTYPES[V], word);
       			WORDFREQ[V] = frequency;
       			N+=WORDFREQ[V];
       			V++;
			} else {
				V0++;
			}
		}
   }
   V--;
   if ((V0 > 0) && (includeZero==0)) {
	   	fprintf(stderr, "V0 = %d\n", V0);
   }
   if (verbose) showwflist(stderr);
   updatespectrum(V);
   constants(1);

   showspectrum(fpspectrum);
   fclose(fpspectrum);
   fprintf(stderr, "printed frequency spectrum\n");

   showsummary(fpsum, argv);
   fclose(fpsum);
   fprintf(stderr, "printed summary statistics\n");

   return (0);
}


void showsummary(bestand, filename)
FILE *bestand;
char *filename[];
{
   fprintf (bestand, "Summary statistics for %s\n\n", *filename);
   fprintf (bestand, "N         %25d\n", N);
   fprintf (bestand, "K         %25.15f\n", tabel_orig[1][nchunks]);
   fprintf (bestand, "D         %25.15f\n", tabel_orig[2][nchunks]); 
   fprintf (bestand, "V(N)      %25d\n", (int) tabel_orig[3][nchunks]);
   fprintf (bestand, "V(1,N)    %25d\n", (int) tabel_orig[4][nchunks]);
   fprintf (bestand, "V(2,N)    %25d\n", (int) tabel_orig[5][nchunks]);
   fprintf (bestand, "V(3,N)    %25d\n", (int) tabel_orig[6][nchunks]);
   fprintf (bestand, "V(4,N)    %25d\n", (int) tabel_orig[7][nchunks]);
   fprintf (bestand, "V(5,N)    %25d\n", (int) tabel_orig[8][nchunks]);
   fprintf (bestand, "R         %25.15f\n", tabel_orig[9][nchunks]);
   fprintf (bestand, "W         %25.15f\n", tabel_orig[10][nchunks]);
   fprintf (bestand, "S         %25.15f\n", tabel_orig[11][nchunks]);
   fprintf (bestand, "H         %25.15f\n", tabel_orig[12][nchunks]);
   fprintf (bestand, "C         %25.15f\n", tabel_orig[13][nchunks]);
   fprintf (bestand, "E         %25.15f\n", tabel_orig[14][nchunks]);
   fprintf (bestand, "logM      %25.15f\n", tabel_orig[15][nchunks]);
   fprintf (bestand, "logStd    %25.15f\n", tabel_orig[16][nchunks]);
   fprintf (bestand, "b         %25.15f\n", tabel_orig[17][nchunks]);
   fprintf (bestand, "c         %25.15f\n", tabel_orig[18][nchunks]);
   fprintf (bestand, "a1        %25.15f\n", tabel_orig[19][nchunks]);
   fprintf (bestand, "Z         %25.15f\n", tabel_orig[20][nchunks]);
   fprintf (bestand, "f(a,N)    %25d\n", (int) tabel_orig[21][nchunks]);
   fprintf (bestand, "f(the,N)  %25d\n", (int) tabel_orig[22][nchunks]);
   fprintf (bestand, "sLmean    %25.15f\n", tabel_orig[23][nchunks]);
   fprintf (bestand, "sLstdev   %25.15f\n", tabel_orig[24][nchunks]);
   fprintf (bestand, "pHL       %25.15f\n", tabel_orig[25][nchunks]);
}


int find (wrd)
char wrd[MAXWORDLENGTH];
{
    
 int found, begin, end, mid, cmp, interval, addition, retValue;

 found = 0; begin = 1; end = V;
 retValue = 0;

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        cmp = strcmp(wrd, WORDTYPES[mid]);
        if (cmp > 0){
            begin=mid;
        }
        else{
            if (cmp < 0){
                end = mid;
            }
            else{ /* found */
				found = 1;
				retValue = mid;
            }
        }
    }
    else{
        cmp = strcmp(wrd, WORDTYPES[begin]);
        if (cmp < 0){
            fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
            exit(1);
        }
        else{
            if (cmp == 0){
				found = 1;
				retValue = begin;
            }
            else{
               cmp = strcmp(wrd, WORDTYPES[end]);
               if (cmp < 0){
                   fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
                   exit(1);
               }
               else{
                   if (cmp==0){
					   found = 1;
					   retValue = end;
                   }
                   else{
                        fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
                        exit(1);
                   }
               }
            }
        }
    }
 }
 
 return retValue;
}


void showwflist(bestand)
FILE *bestand;
{
 int i;
 fprintf(bestand, "Word Frequency\n");
 for (i = 1; i <= V; i++){
    fprintf(bestand, "%s %d\n", WORDTYPES[i], WORDFREQ[i]);
 }
}


void insertintotypelist(wrd)   /* binary search */
char wrd[MAXWORDLENGTH];
{
 int found, begin, end, mid, cmp, interval, addition;

 found = 0; begin = 1; end = V; 

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        cmp = strcmp(wrd, WORDTYPES[mid]);
        if (cmp > 0){
            begin=mid;
        }
        else{
            if (cmp < 0){
                end = mid;
            }
            else{ /* found */
                WORDFREQ[mid]++;
                found = 1;
            }
        }
    }
    else{
        cmp = strcmp(wrd, WORDTYPES[begin]);
        if (cmp < 0){
            insert(wrd, begin);
            found = 1;
        }
        else{
            if (cmp == 0){
               WORDFREQ[begin]++;
               found = 1;
            }
            else{
               cmp = strcmp(wrd, WORDTYPES[end]);
               if (cmp < 0){
                   insert(wrd,end);
                   found=1;
               }
               else{
                   if (cmp==0){
                        WORDFREQ[end]++;
                        found=1;
                   }
                   else{
                        insert(wrd, end+1);
                        found = 1;
                   }
               }
            }
        }
    }
 }   
}


void insert(woord, pos)
char woord[MAXWORDLENGTH];
int pos;
{
 int ii;
 for (ii = V+1; ii > pos; ii--){
     strcpy(WORDTYPES[ii], WORDTYPES[ii-1]);
     WORDFREQ[ii] = WORDFREQ[ii-1];
 }
 strcpy(WORDTYPES[pos], woord);
 WORDFREQ[pos] = 1;

 V++;
}


void showtypelist(wrd, v)
char wrd[MAXWORDLENGTH];
int v;
{
 int iii;
 for (iii = 1; iii <= v; iii++){
     printf("%-20s %10d      [%s]\n", WORDTYPES[iii], WORDFREQ[iii], wrd);
 }
 printf("-------------------\n");
}


void help ()
{
  fprintf (stderr,"wfl2spc -e -m data.wfl\n");
  fprintf (stderr,"OPTIONS:\n");
  fprintf (stderr,"      -e: input file does not have .wfl extension\n");
  fprintf (stderr,"      -m: input file does not have a header\n");
  fprintf (stderr,"INPUT:\n");
  fprintf (stderr,"      data.wfl: m Vm\n");
  fprintf (stderr,"OUTPUT:\n");
  fprintf (stderr,"      data.spc: frequency spectrum\n");
  fprintf (stderr,"      data.sum: summary statistics\n");
  exit (1);
}

/* =====================================================================*/


void SichelParam ()    /* estimates parameters for Gamma fixed at -0.5 */
{
    double upper, lower, stepsize, x, y, z, theta,alpha;
    int slecht, iterations;

    upper = ((double)curN)/((double)curV1);
    lower = NULL_F;
    stepsize = ((double)curN/(double)curV1)/1000.0;
    x = upper;
    slecht = 0;
    iterations = 0;

    while (delta (x, (double) curN, (double) curV, (double) curV1) > EPSILON) {
        iterations++;
        upper = x;
        x -= stepsize;
        if (x > NULL_F) {
           if (delta (x, (double) curN, (double) curV, (double) curV1) < NULL_F) {
               x = upper;
               stepsize /= 1000.0;
           }
        }
        if (fz (x, (double) curN, (double) curV, (double) curV1) < NULL_F) {
             x = upper;
             slecht = 1;
             break;
        }
        if (iterations > MAXITERATIONS){
             x = upper;
             slecht = 1;
             break;
        }
    }
    if (x < MAXX){ 
    z = x;
    y = ((double)curV1/(double)curN)*z;
    theta = (1.0 - (y*y));
    alpha = ((2.0 * curV1)/(1.0*curV*theta))*
             (((1.0*curN*y)/((double) curV1))-1.0);

    cur_c = theta/((double)curN*(1.0-theta));
    cur_b = alpha/sqrt(1.0 + (cur_c * (double)curN));
    cur_a =  (cur_b*(double)curN)/
              ((2.0/cur_c) * sqrt(1.0 + ((double)curN * cur_c)) *
              (exp( cur_b * (sqrt(1.0 + ((double)curN * cur_c))-1.0))-1.0));
    }
    else slecht = 1;

    if (slecht){
      cur_c = 0; cur_b = 0; cur_a = 0; sichelzeros++;
    }

}


void constants (j)
int	j;

{
      curN = N;

      /* calculate constants */

      tabel_orig[1][j] = getK (maantal, VM, MLIJST, curN);
      tabel_orig[2][j] = getD (maantal, VM, MLIJST, curN);
      tabel_orig[3][j] = (double) curV;
      tabel_orig[4][j] = (double) curV1;
      tabel_orig[5][j] = (double) curV2;
      tabel_orig[6][j] = (double) curV3;
      tabel_orig[7][j] = (double) curV4;
      tabel_orig[8][j] = (double) curV5;
      tabel_orig[9][j] = ((double) curV)/sqrt(curN);
      tabel_orig[10][j] = exp (exp (-0.17 * log(curV)) * log (curN));
      tabel_orig[11][j] = (double) curV2 / (double) curV;
      tabel_orig[12][j] = 100.0 * log((double)curN)/
                      (1.0 - ((double)curV1/(double)curV));
      tabel_orig[13][j] = log (curV) / log (curN);
      tabel_orig[14][j] = getE (maantal, VM, MLIJST, curN);
      tabel_orig[15][j] = getlogMean (maantal, VM, MLIJST, curV);
      tabel_orig[16][j] = getlogStdDev (tabel_orig[15][j], maantal, VM, MLIJST, curV);

      /* calculates the parameters for Sichel */
      SichelParam ();  
      tabel_orig[17][j] = cur_b;
      tabel_orig[18][j] = cur_c;
      tabel_orig[19][j] = cur_a;
      tabel_orig[20][j] = getZ ((double) mMax / (double) curN, curN, curV, &zipfzeros);
      tabel_orig[21][j] = 0;
      tabel_orig[22][j] = 0;

      /* calculate parameters of Carroll's model, rough sample estimates */
      lognormal (maantal, PVM, VM, MLIJST, curN, &smean, &sstdev);
      tabel_orig[23][j] = smean;
      tabel_orig[24][j] = sstdev;
}


void updatespectrum(eind)    /* build spectrum from wflist */
int eind;               /* first, last position to take into account */
{
  int i; 
  
  curV = eind;          /* current number of types */
  
  MLIJST[1] = WORDFREQ[1];
  VM[1] = 1;
  maantal = 1;
  for (i = 2; i <= curV; i++){   
       insertintoMLIJST(WORDFREQ[i], i);
  }

  /* this cumbersome loop allows for the possibility of gaps early on
     in the spectrum */
  curV1 = 0; curV2 = 0; curV3 = 0; curV4 = 0; curV5 = 0;
  for (i = 1; i <= 5; i++){
    if (MLIJST[i]==1) curV1 = VM[i];  /* current number of hapaxes */
    if (MLIJST[i]==2) curV2 = VM[i];  /* current number of dislegomena */
    if (MLIJST[i]==3) curV3 = VM[i];  
    if (MLIJST[i]==4) curV4 = VM[i]; 
    if (MLIJST[i]==5) curV5 = VM[i];
  }
  mMax = MLIJST[maantal];
}


void showspectrum(bestand)
FILE *bestand;
{
 int i;
 fprintf(bestand, "m Vm\n");
 for (i = 1; i <= maantal; i++){
     fprintf(bestand, "%d %d\n", MLIJST[i], VM[i]);
 }
}


void insertintoMLIJST(frq, tracepos)   /* binary search */
int frq, tracepos;
{
 int found, begin, end, mid, interval, addition;

 found = 0; begin = 1; end = maantal; mid = 0;

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        if (frq > MLIJST[mid]){
            begin=mid;
        }
        else{
            if (frq < MLIJST[mid]){
                end = mid;
            }
            else{ /* found */
                VM[mid]++;
                found = 1;
            }
        }
    }
    else{
        if (frq < MLIJST[begin]){
            insertM(frq, begin, tracepos);
            found = 1;
        }
        else{
            if (frq == MLIJST[begin]){
               VM[begin]++;
               found = 1;
            }
            else{
               if (frq < MLIJST[end]){
                   insertM(frq,end, tracepos);
                   found=1;
               }
               else{
                   if (frq==MLIJST[end]){
                        VM[end]++;
                        found=1;
                   }
                   else{
                        insertM(frq, end+1, tracepos);
                        found = 1;
                   }
               }
            }
        }
    }
 }   
}


void insertM(freq, pos, tracepos)
int freq, pos, tracepos;
{
 int ii;

 for (ii = maantal+1; ii > pos; ii--){
     MLIJST[ii] = MLIJST[ii-1];
     VM[ii] = VM[ii-1];
 }

 MLIJST[pos] = freq;
 VM[pos] = 1;

 maantal++;
}
