/*
 *	File: spectrum.c
 *
 *      (C) IWTS
 *          KU Nijmegen
 *          The Netherlands
 *
 *      Author: R. Harald Baayen
 *
 *      History:
 *
 *      - mar 1997, version 1.0
 *      - may 1999, version 1.1  
 *      - june 2000, version 1.2  
 *
 *      Description:
 *
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include "lex_cons.h"


/* EXTERN FUNCTIONS */

/* functions for numerical procedures */

extern double	getD ();
extern double	getE ();
extern double	getK ();
extern double	getZ ();
extern double	getlogMean ();
extern double	getlogStdDev ();
extern void	lognormal ();

extern double	fz ();
extern double	delta ();

extern void	qcksrt2 ();

extern void	print_tabel ();
extern void	SichelParam ();

extern void	checkthea ();        /* updates frequencies of the and a    */
extern void	showzvec ();         /* print the text in zvec form         */
extern void	showzrank ();        /* print rank-frequency file           */
extern void	showsummary ();      /* print summary file                  */
extern void	showwflist ();       /* print wflist file                   */
extern void	showspectrum ();     /* print spectrum file                 */
extern void	updatewflist ();     /* update wflist with next chunk       */
extern void	updatespectrum ();   /* update frequency spectrum           */
extern void	insertintoMLIJST (); /* insert new word freq into MLIJST    */
extern void	insertM ();          /* insertion itself for previous       */
extern void	constants ();        /* calculates constants from spectrum  */
extern int	find ();             /* find word type in WORDTYPES         */

/* functions for cleaning up the input, removing sgml codes, ... */

extern int	gettextsize ();
extern void	developmentprofile ();
extern void	insertintotypelist ();
extern void	showtypelist ();
extern int	readword ();
extern int	isfulltag ();
extern int	isbegintag ();
extern int	isendtag ();
extern int	istag ();

/* argument reading, file manipulation, and help function */

extern int	leesgetal ();
extern void	change_extension ();
extern void	help ();


/* GLOBAL VARIABLES */

double
        /* GLOBAL VARIABLES FOR PARAMETERS OF SICHEL'S MODEL */

        cur_c, cur_b, cur_a1,

        /* PARAMETERS OF SICHEL'S MODEL, ESTIMATED FROM THE SAMPLE */

        smean, sstdev,
        PVM[MAXTYPES3],

        /* DATA STRUCTURE FOR OBSERVED PROFILE */

        tabel_orig[NMEASURES+1][MAXCHUNKS2+1];


FILE
        /* INPUT FILE */

        *fptext,                    /* input text */

        /* OUTPUT FILES */

        *fpvec,                     /* text in zvec format */
        *fpzvec,                    /* text in zvec format */
        *fpwflist,                  /* word frequency list */
        *fpspectrum,                /* frequency spectrum */
        *fpzrank,                   /* rank-frequency list */
        *forig,                     /* developmental profile */
        *fpsum;                     /* file with summary statistics */

int     i, j, k, l, aantal,         /* counters */

        /* BASIC DATA STRUCTURES */

        CHUNKS[MAXCHUNKS2+1],       /* the K chunksizes */
        TYPENLIJST[MAXTYPES3+1],    /* list of types */
        VM[MAXTYPES3+1],            /* frequency spectrum: V(m) */
        MLIJST[MAXTYPES3+1],        /* list of frequency ranks m */
        WORDFREQ[MAXTYPES3+1],      /* frequency counts for types */
        WORDINDEX[MAXTYPES3+1],     /* indices of frequencies */
        SORTFREQ[MAXTYPES3+1],      /* sorted frequencies, high to low */
        ZRANK[MAXTYPES3+1],         /* Zipf ranks */

        /* CHUNKS */

        chunk,                      /* variables for determining the chunks */
        nchunks,
        chunksize,
        remainDer,

        /* LEXICAL PARAMETERS */

        V, N,                       /* types, tokens */
        curN, curV,                 /* types, tokens up to current chunk size */
        Nzero, Vzero,               /* empirical number of tokens, types */
        curV1, curV2,               /* hapaxes, disleg. up to current chunk */
        curV3, curV4, curV5,
        mMax, gmMax,                /* maximal (current) frequency rank */
        freq_a, freq_the,           /* current frequencies of these words */
        maantal,                    /* number of frequency ranks */

        /* file and command line handling */

        verbose,                    /* boolean for print statements */
        c, lastchar,                /* character variabeles */
        ignore,                     /* boolean for dropping SGML mark-up */
        inheader,                   /* true if in header */
        divtagcount,                /* count for stack of incomplete tags */
        noextension,                /* boolean for presence of ".txt" extension */
        allfiles,                   /* boolean for printing text vectors */
        vecexists,                  /* text in vector format exists already */
        removeSgml,                 /* boolean for removing sgml code */
        stringPlease,               /* just leave strings unaffected in 
									   readword */

        /* ZIPF, SICHEL */

        zipfzeros, sichelzeros;     /* counts for fitless spectra */

char    *s,                         /* scanning command line */

        /* variables for scanning and handling file input */

        beginheader[MAXWORDLENGTH2],
        endheader[MAXWORDLENGTH2],
        newword[MAXWORDLENGTH2],
        testword[MAXWORDLENGTH2],
        word[MAXWORDLENGTH2],

        /* variables for handling file name extensions */
 
        woord[MAXWORDLENGTH2],       /* variable for a word */
        new_name[MAXWORDLENGTH2],    /* variables for extension handling */
        base_name[MAXWORDLENGTH2],
        fpvecname[MAXWORDLENGTH2],

        /* THE BASIC LIST OF WORD TYPES */

        WORDTYPES[MAXTYPES3+1][MAXWORDLENGTH2];


/* MAIN () */

int main (argc, argv)           
int     argc;
char   *argv[];

{
   /* DEFAULT SETTINGS */

   nchunks = DEF_CHUNKS;            /* default number of chunks */
   zipfzeros = 0;                   /* count of missing fits Zipf */
   sichelzeros = 0;                 /* count of missing fits Sichel */
   verbose = 0; 
   vecexists = 0;                   /* text exists already in vector format */
   noextension = 0;                 /* default: file has ".txt" extension */
   allfiles = 1;                    /* default: print all files */
   removeSgml = 1;                  /* remove Sgml code by default */
   stringPlease = 0;                /* remove punctation marks by default */

   strcpy(beginheader, "<bncdoc");  /* note: lower case ! */
   strcpy(endheader, "<header>");   /* note: / is ignored already */

   while (--argc > 0 && (*++argv)[0] == '-') {
        for (s = argv[0] + 1; *s != '\0'; s++) {
            switch (*s) {
                   case 'h':   /* help */
                        help();
                        exit(1);
                        break;
                   case 'v':   /* extra print statements */
                        verbose = 1;
                        break;
                   case 'e':   /* file name does not have extension ".txt" */
                        noextension = 1;
                        break;
                   case 's':   /* suppress printing text vectors */
                        allfiles = 0;
                        break;
                   case 'n':   /* do not attempt to remove Sgml code */
                        removeSgml = 0;
                        break;
                   case 'y':   /* text vector exists */
                        vecexists = 1;
                        break;
                   case 'p':   /* literal strings please */
                        stringPlease = 1;
                        break;
                   case 'k':
                        nchunks = leesgetal (s, &aantal);
                        for (; aantal > 0; aantal--){
                            s++;
                        }
                        break;
		           default: 
		                fprintf (stderr, "spectrum: illegal option %c\n", *s);
		                exit (1);
		                break;
	        } /* of switch */
	    }  /* of for */
   }  /* of while */


   /* FILE HANDLING */
  
   if (argc == 0) {
     help ();
   }

   /* THE INPUT */

   if ((fptext = fopen (*argv, "r")) == NULL) {
        fprintf(stderr, "spectrum: can't open %s for reading\n", *argv);
        exit (1);
   }

   /* OBLIGATORY OUTPUT FILES */

   /* base_name is file name ".txt" is stripped when present */
      
   if (noextension == 0){
       strncpy(base_name, *argv, strlen(*argv) - 4);
   }
   else{
       strcpy(base_name, *argv);
   }

   change_extension (base_name, new_name, ".vec");
   strcpy(fpvecname,new_name);
   if (vecexists){
      if ((fpvec = fopen(new_name, "r")) == NULL){
         fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
         exit(1);
      }
   }
   else{
      if ((fpvec = fopen(new_name, "w")) == NULL){
         fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
         exit(1);
      }
   }
 
   change_extension (base_name, new_name, ".obs");
   if ((forig = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
    exit(1);
   }

   change_extension (base_name, new_name, ".wfl");
   if ((fpwflist = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
    exit(1);
   }

   change_extension (base_name, new_name, ".spc");
   if ((fpspectrum = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
    exit(1);
   }

   change_extension (base_name, new_name, ".sum");
   if ((fpsum = fopen(new_name, "w")) == NULL){
    fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
    exit(1);
   }

   /* READ THROUGH INPUT FILE ONCE TO DETERMINE TEXT SIZE */

   if (vecexists){
       Nzero = 0;
       while (fscanf(fpvec, "%s", newword) != EOF){
           Nzero++;
       }
       rewind(fpvec);
   }
   else{
       Nzero = gettextsize();
   }
   fprintf(stderr, "spectrum: completed first scan of %s: N = %d\n", *argv, Nzero);

   /* OPTIONAL OUTPUT FILES: THE BIG TEXT VECTOR FILES */

   if (allfiles){
        change_extension (base_name, new_name, ".zvc");
        if ((fpzvec = fopen(new_name, "w")) == NULL){
           fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
           exit(1);
        }
        change_extension (base_name, new_name, ".zrk");
        if ((fpzrank = fopen(new_name, "w")) == NULL){
           fprintf(stderr, "spectrum: can't open textfile %s\n", new_name);
           exit(1);
        }
   }   

   /* DETERMINE THE SUCCESSIVE CHUNKS */

   chunksize = (int) floor(Nzero/(nchunks*1.0));
   remainDer = Nzero - (nchunks * chunksize);
   for (k = 1; k <= nchunks; k++) CHUNKS[k] = chunksize;
   for (k = 1; k <= remainDer; k++) CHUNKS[k]++;
   for (k = 2; k <= nchunks; k++) CHUNKS[k] += CHUNKS[k-1];
   CHUNKS[0] = 0;

   /* read through file and update spectrum */

   fprintf(stderr, "spectrum: calculating developmental profile of %s\n",*argv);

   N = 0; V = 0; k = 1;
   developmentprofile();

   fprintf(stderr, "spectrum: completed second scan of %s: N = %d  V = %d\n", *argv, N, V);

   print_tabel(forig, tabel_orig);
   fclose(forig);
   fprintf(stderr, "printed developmental profile\n");

   showwflist(fpwflist);
   fclose(fpwflist);
   fprintf(stderr, "printed word frequency list\n");

   showspectrum(fpspectrum);
   fclose(fpspectrum);
   fprintf(stderr, "printed frequency spectrum\n");

   showsummary(fpsum, argv);
   fclose(fpsum);
   fprintf(stderr, "printed summary statistics\n");

   if (allfiles){
      for (i = 1; i <= V; i++){
          WORDINDEX[i] = i;
          SORTFREQ[i] = WORDFREQ[i];
      }
      fprintf(stderr, "sorting word frequency list\n");
      qcksrt2(V, SORTFREQ, WORDINDEX);

      showzrank(fpzrank);
      fclose(fpzrank);

      j = 0;
      for (i = V; i > 0; i--){
          j++;
          ZRANK[WORDINDEX[i]] = j;
      }

      rewind(fpvec);

      fprintf(stderr, "scanning text and printing in zvec format\n");

      showzvec(fpzvec);
      fclose(fpzvec);
   }

   return (0);
}

void showzrank(bestand)
FILE *bestand;
{
 int i, j;

 fprintf (bestand, "z fz\n");
 j = 0;
 for (i = V; i > 0; i--){
      j++;
      fprintf(bestand, "%d %d\n", j, SORTFREQ[i]);
 }

}

void showsummary(bestand, filename)
FILE *bestand;
char *filename[];
{
   fprintf(bestand, "Summary statistics for %s\n\n", *filename);
   fprintf(bestand, "N         %25d\n", N);
   fprintf(bestand, "K         %25.15f\n", tabel_orig[1][nchunks]);
   fprintf(bestand, "D         %25.15f\n", tabel_orig[2][nchunks]); 
   fprintf(bestand, "V(N)      %25d\n", (int) tabel_orig[3][nchunks]);
   fprintf(bestand, "V(1,N)    %25d\n", (int) tabel_orig[4][nchunks]);
   fprintf(bestand, "V(2,N)    %25d\n", (int) tabel_orig[5][nchunks]);
   fprintf(bestand, "V(3,N)    %25d\n", (int) tabel_orig[6][nchunks]);
   fprintf(bestand, "V(4,N)    %25d\n", (int) tabel_orig[7][nchunks]);
   fprintf(bestand, "V(5,N)    %25d\n", (int) tabel_orig[8][nchunks]);
   fprintf(bestand, "R         %25.15f\n", tabel_orig[9][nchunks]);
   fprintf(bestand, "W         %25.15f\n", tabel_orig[10][nchunks]);
   fprintf(bestand, "S         %25.15f\n", tabel_orig[11][nchunks]);
   fprintf(bestand, "H         %25.15f\n", tabel_orig[12][nchunks]);
   fprintf(bestand, "C         %25.15f\n", tabel_orig[13][nchunks]);
   fprintf(bestand, "E         %25.15f\n", tabel_orig[14][nchunks]);
   fprintf(bestand, "logM      %25.15f\n", tabel_orig[15][nchunks]);
   fprintf(bestand, "logStd    %25.15f\n", tabel_orig[16][nchunks]);
   fprintf(bestand, "b         %25.15f\n", tabel_orig[17][nchunks]);
   fprintf(bestand, "c         %25.15f\n", tabel_orig[18][nchunks]);
   fprintf(bestand, "a1        %25.15f\n", tabel_orig[19][nchunks]);
   fprintf(bestand, "Z         %25.15f\n", tabel_orig[20][nchunks]);
   fprintf(bestand, "f(a,N)    %25d\n", (int) tabel_orig[21][nchunks]);
   fprintf(bestand, "f(the,N)  %25d\n", (int) tabel_orig[22][nchunks]);
   fprintf(bestand, "sLmean    %25.15f\n", tabel_orig[23][nchunks]);
   fprintf(bestand, "sLstdev   %25.15f\n", tabel_orig[24][nchunks]);
}


void showzvec(bestand)
FILE *bestand;
{
    fprintf(bestand, "Word z\n");
    while (fscanf(fpvec, "%s", newword) != EOF){
        fprintf(bestand, "%s %d\n", newword, ZRANK[find(newword)]);
    }
}


int find (wrd)
char wrd[MAXWORDLENGTH2];
{
    
 int found, begin, end, mid, cmp, interval, addition, retValue;

 found = 0; begin = 1; end = V;
 retValue = 0;

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        cmp = strcmp(wrd, WORDTYPES[mid]);
        if (cmp > 0){
            begin=mid;
        }
        else{
            if (cmp < 0){
                end = mid;
            }
            else{ /* found */
				found = 1;
				retValue = mid;
            }
        }
    }
    else{
        cmp = strcmp(wrd, WORDTYPES[begin]);
        if (cmp < 0){
            fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
            exit(1);
        }
        else{
            if (cmp == 0){
				found = 1;
				retValue = begin;
            }
            else{
               cmp = strcmp(wrd, WORDTYPES[end]);
               if (cmp < 0){
                   fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
                   exit(1);
               }
               else{
                   if (cmp==0){
					   found = 1;
					   retValue = end;
                   }
                   else{
                        fprintf(stderr, "spectrum: %s not in TYPELIST\n", word);
                        exit(1);
                   }
               }
            }
        }
    }
 }
 
 return retValue;
}


int gettextsize ()
{
    ignore = 0; lastchar = '\0'; inheader = 0; N = 1;
    while (readword()){

       /* first check header status */

       if (strcmp(newword, beginheader)==0){
          inheader = 1;
       }
       else{
          if (strcmp(newword, endheader)==0){
              inheader = 0;
          }
       }

       /* next get rid of tag information */

       if (isbegintag(newword)) divtagcount++;
       if (isendtag(newword)) divtagcount--;

       if (removeSgml==1) {
         if ((inheader == 0) && (istag(newword)==0) && (divtagcount == 0)){
            fprintf(fpvec, "%s\n", newword);
            N++;
         }
       } else {
         if (removeSgml == 0) {
            fprintf(fpvec, "%s\n", newword);
            N++;
         }
       }
    }
    N--;
    fclose(fpvec);
    if ((fpvec = fopen(fpvecname, "r")) == NULL){
       fprintf(stderr, "spectrum: can't open %s for reading\n", fpvecname);
       exit(1);
    }

    return(N);
}


void updatewflist(woord)
char woord[MAXWORDLENGTH2];
{
 N++;
 if (N > 1){
   insertintotypelist(woord);
 }
 else{
   strcpy(WORDTYPES[1], woord);
   WORDFREQ[1] = 1;
   V++;
   if (V >= MAXTYPES3){
       fprintf(stderr, "spectrum: vocabulary size exceeds array bounds!\n");
       exit(1);
   }
   checkthea(woord);
 }
}


void checkthea(wrd)
char wrd[MAXWORDLENGTH2];
{
   if (strcmp(wrd, "the")==0) {
      freq_the++;
   }
   else{
      if (strcmp(wrd, "a")==0) freq_a++;
   }
}


void showwflist(bestand)
FILE *bestand;
{
 int i;
 fprintf(bestand, "Word Frequency\n");
 for (i = 1; i <= V; i++){
    fprintf(bestand, "%s %d\n", WORDTYPES[i], WORDFREQ[i]);
 }
}


void developmentprofile()
{
    N = 0;
    while (fscanf(fpvec, "%s", newword) != EOF){
          updatewflist(newword);
          if (N == CHUNKS[k]){
              fprintf(stderr, "[%d]", k);
              if (verbose) showwflist(stderr);
              updatespectrum(V);
              if (verbose) showspectrum(stderr);
              constants(k);
              k++;
          }
    }
    rewind(fpvec); 
    fprintf(stderr, "\n");
}


int readword()
{
  int		i, c;
  if (stringPlease == 1) {
  /*
    i = 0;
    while ((c = getc(fptext)) != EOF) {
       newword[i] = c;
       if (isspace(c)){
           newword[i] = '\0';
		   i = 0;
		   return(1);
	   } else {
       	   i++;
	   }
	}
	if (i > 0) {
		newword[i] = '\0';
		return(1);
	} else {
		return(0);
	}
	*/
	  fprintf(stderr, "To be continued\n");
	  exit(1);
  } else {
    i = 0;
    while ((c = getc(fptext)) != EOF) {
       if (isalpha (c) || isdigit (c)){
          newword[i] = tolower(c);
          i++;
          lastchar = c;
       }
       else{
          if (isspace(c)|| (c=='/')){
               if (lastchar != '\0'){
                   newword[i] = '\0';
                   lastchar = '\0'; 
                   i=0;
                   return(1);
               }
          }
          else{
             if (c=='>'){
                 newword[i] = c;
                 i++; newword[i] = '\0';
                 i=0; lastchar = '\0';
                 return(1);
             }
             else{
                 if (c=='<'){
                     if (lastchar == '\0'){
                        newword[i] = c;
                        i++; 
                     }
                     else{
                         newword[i] = '\0';
                         i=0; 
                         lastchar = '\0';
                         ungetc(c, fptext);
                         return(1);
                     }
                 }
                 else{
                     if (c=='&'){
                        if (lastchar == '\0'){
                           newword[i] = c;
                           i++; 
                        }
                        else{
                            newword[i] = '\0';
                            i=0; 
                            lastchar = '\0';
                            ungetc(c, fptext);
                            return(1);
                        }
                     }
                     else{
                        if ((c==';')&&(newword[0]=='&')){
                            newword[i] = c;
                            i++; newword[i] = '\0';
                            i=0; lastchar = '\0';
                            return(1);
                        }
                     }
                 }
             }
          }
      }
   }
   return(0);
  }
}


int isfulltag(wrd)
char wrd[MAXWORDLENGTH2];
{
  if ((wrd[0]=='<') && (wrd[strlen(wrd)-1]=='>')){
       return(1);
  }
  else{
       return(0);
  }
}


int isendtag(wrd)
char wrd[MAXWORDLENGTH2];
{
  if ((wrd[0]!='<') && (wrd[strlen(wrd)-1]=='>')){
       return(1);
  }
  else{
       return(0);
  }
}


int isbegintag(wrd)
char wrd[MAXWORDLENGTH2];
{
  if ((wrd[0]=='<') && (wrd[strlen(wrd)-1]!='>')){
       return(1);
  }
  else{
       return(0);
  }
}


int istag(wrd)
char wrd[MAXWORDLENGTH2];
{
  if (wrd[0]=='<'){
         return(1);
  }
  else{
    if (wrd[strlen(wrd)-1]=='>') {
         return(1);
    }
    else{
       if ((wrd[0]=='&')&&(wrd[1]!='\0')) return(1);
    }
  }

  return(0);
}


void insertintotypelist(wrd)   /* binary search */
char wrd[MAXWORDLENGTH2];
{
 int found, begin, end, mid, cmp, interval, addition;
 extern void insert();

 found = 0; begin = 1; end = V; 

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        cmp = strcmp(wrd, WORDTYPES[mid]);
        if (cmp > 0){
            begin=mid;
        }
        else{
            if (cmp < 0){
                end = mid;
            }
            else{ /* found */
                WORDFREQ[mid]++;
                found = 1;
                checkthea(WORDTYPES[mid]);
            }
        }
    }
    else{
        cmp = strcmp(wrd, WORDTYPES[begin]);
        if (cmp < 0){
            insert(wrd, begin);
            found = 1;
        }
        else{
            if (cmp == 0){
               WORDFREQ[begin]++;
               found = 1;
               checkthea(WORDTYPES[begin]);
            }
            else{
               cmp = strcmp(wrd, WORDTYPES[end]);
               if (cmp < 0){
                   insert(wrd,end);
                   found=1;
               }
               else{
                   if (cmp==0){
                        WORDFREQ[end]++;
                        found=1;
                        checkthea(WORDTYPES[end]);
                   }
                   else{
                        insert(wrd, end+1);
                        found = 1;
                   }
               }
            }
        }
    }
 }   
}


void insert(woord, pos)
char woord[MAXWORDLENGTH2];
int pos;
{
 int ii;
 for (ii = V+1; ii > pos; ii--){
     strcpy(WORDTYPES[ii], WORDTYPES[ii-1]);
     WORDFREQ[ii] = WORDFREQ[ii-1];
 }
 strcpy(WORDTYPES[pos], woord);
 WORDFREQ[pos] = 1;

 checkthea(woord);

 V++;
}


void showtypelist(wrd, v)
char wrd[MAXWORDLENGTH2];
int v;
{
 int iii;
 for (iii = 1; iii <= v; iii++){
     printf("%-20s %10d      [%s]\n", WORDTYPES[iii], WORDFREQ[iii], wrd);
 }
 printf("-------------------\n");
}


void help ()
{
  fprintf (stderr,"spectrum -kK -e -s text.txt\n");
  fprintf (stderr,"OPTIONS:\n");
  fprintf (stderr,"      -k: K text chunks   (default: K = 20)\n");
  fprintf (stderr,"      -e: input file does not have .txt extension\n");
  fprintf (stderr,"      -s: don't print text.zvec and text.zrank\n");
  fprintf (stderr,"      -n: don't attempt to remove Sgml code\n");
  fprintf (stderr,"      -p: don't attempt to remove punctuation marks\n");
  fprintf (stderr,"INPUT:\n");
  fprintf (stderr,"      text.txt:  a text file (ASCII)\n");
  fprintf (stderr,"OUTPUT:\n");
  fprintf (stderr,"      text.obs: empirical developmental profile\n");
  fprintf (stderr,"      text.zvc: text in vector format with Zipf ranks\n");
  fprintf (stderr,"      text.wfl: word frequency list\n");
  fprintf (stderr,"      text.spc: frequency spectrum\n");
  fprintf (stderr,"      text.zrk: rank-frequency list\n");
  fprintf (stderr,"      text.sum: summary statistics\n");
  exit (1);
}

/* =====================================================================*/


void SichelParam ()    /* estimates parameters for Gamma fixed at -0.5 */
{
    double	upper, lower, stepsize, x, y, z, theta,alpha;
    int		slecht, iterations;

    upper = ((double)curN)/((double)curV1);
    lower = NULL_F;
    stepsize = ((double)curN/(double)curV1)/1000.0;
    x = upper;
    slecht = 0;
    iterations = 0;

    while (delta (x, (double) curN, (double) curV, (double) curV1) > EPSILON) {
        iterations++;
        upper = x;
        x -= stepsize;
        if (x > NULL_F) {
           if (delta (x, (double) curN, (double) curV, (double) curV1) < NULL_F) {
               x = upper;
               stepsize /= 1000.0;
           }
        }
        if (fz (x, (double) curN, (double) curV, (double) curV1) < NULL_F) {
             x = upper;
             slecht = 1;
             break;
        }
        if (iterations > MAXITERATIONS){
             x = upper;
             slecht = 1;
             break;
        }
    }
    if (x < MAXX) { 
      z = x;
      y = ((double)curV1/(double)curN)*z;
      theta = (1.0 - (y*y));
      alpha = ((2.0 * curV1)/(1.0*curV*theta))*
               (((1.0*curN*y)/((double) curV1))-1.0);

      cur_c = theta/((double)curN*(1.0-theta));
      cur_b = alpha/sqrt(1.0 + (cur_c * (double)curN));
      cur_a1 = (cur_b*(double)curN)/
               ((2.0/cur_c) * sqrt(1.0 + ((double)curN * cur_c)) *
               (exp( cur_b * (sqrt(1.0 + ((double)curN * cur_c))-1.0))-1.0));
    }
    else {
      slecht = 1;
    }

    if (slecht) {
      cur_c = 0;
      cur_b = 0;
      cur_a1 = 0;
      sichelzeros++;
    }
}

void print_tabel (bestand, tabel)
FILE *bestand;
double tabel[NMEASURES+1][MAXCHUNKS2+1];
{
   int i, j;
   
   fprintf(bestand, "N K D V V1 V2 V3 V4 V5 R W S H C E lM lSt b c a1 Z fa fthe sLmean sLstdev\n");
   for (j = 1; j <= nchunks; j++){
         fprintf(bestand, "%d", CHUNKS[j]);
         for (i = 1; i <= NMEASURES; i++){
               fprintf(bestand, " %10.7f", tabel[i][j]);
         }
         fprintf(bestand, "\n");
   }
}


void constants (j)
int j;

{
      curN = CHUNKS[j];

      /* calculate constants */

      tabel_orig[1][j] = getK (maantal, VM, MLIJST, curN);
      tabel_orig[2][j] = getD (maantal, VM, MLIJST, curN);
      tabel_orig[3][j] = (double) curV;
      tabel_orig[4][j] = (double) curV1;
      tabel_orig[5][j] = (double) curV2;
      tabel_orig[6][j] = (double) curV3;
      tabel_orig[7][j] = (double) curV4;
      tabel_orig[8][j] = (double) curV5;
      tabel_orig[9][j] = ((double) curV)/sqrt(curN);
      tabel_orig[10][j] = exp(exp(-0.17*log(curV))*log(curN));
      tabel_orig[11][j] = ((double) curV2)/((double) curV);
      tabel_orig[12][j] = 100.0 * log((double)curN)/
                      (1.0 - ((double)curV1/(double)curV));
      tabel_orig[13][j] = log (curV) / log (curN);
      tabel_orig[14][j] = getE (maantal, VM, MLIJST, curN);
      tabel_orig[15][j] = getlogMean (maantal, VM, MLIJST, curV);
      tabel_orig[16][j] = getlogStdDev (tabel_orig[15][j], maantal, VM, MLIJST, curV);

      /* calculates the parameters for Sichel */
      SichelParam ();  
      tabel_orig[17][j] = cur_b;
      tabel_orig[18][j] = cur_c;
      tabel_orig[19][j] = cur_a1;
      tabel_orig[20][j] = getZ ((double) mMax / (double) curN, curN, curV, &zipfzeros);
      tabel_orig[21][j] = (double) freq_a;
      tabel_orig[22][j] = (double) freq_the;

      /* calculate parameters of Carroll's model, rough sample estimates */
      lognormal (maantal, PVM, VM, MLIJST, curN, &smean, &sstdev);
      tabel_orig[23][j] = smean;
      tabel_orig[24][j] = sstdev;
}


void updatespectrum(eind)    /* build spectrum from wflist */
int eind;               /* first, last position to take into account */
{
  int i; 
  
  curV = eind;          /* current number of types */
  
  MLIJST[1] = WORDFREQ[1];
  VM[1] = 1;
  maantal = 1;
  for (i = 2; i <= curV; i++){   
       insertintoMLIJST(WORDFREQ[i], i);
  }

  /* this cumbersome loop allows for the possibility of gaps early on
     in the spectrum */
  curV1 = 0; curV2 = 0; curV3 = 0; curV4 = 0; curV5 = 0;
  for (i = 1; i <= 5; i++){
    if (MLIJST[i]==1) curV1 = VM[i];  /* current number of hapaxes */
    if (MLIJST[i]==2) curV2 = VM[i];  /* current number of dislegomena */
    if (MLIJST[i]==3) curV3 = VM[i];  
    if (MLIJST[i]==4) curV4 = VM[i]; 
    if (MLIJST[i]==5) curV5 = VM[i];
  }
  mMax = MLIJST[maantal];
}


void showspectrum(bestand)
FILE *bestand;
{
 int i;
 fprintf(bestand, "m Vm\n");
 for (i = 1; i <= maantal; i++){
     fprintf(bestand, "%d %d\n", MLIJST[i], VM[i]);
 }
}


void insertintoMLIJST(frq, tracepos)   /* binary search */
int frq, tracepos;
{
 int found, begin, end, mid, interval, addition;

 found = 0; begin = 1; end = maantal; mid = 0;

 while (found == 0){
    if (end > begin+1){
        interval = end - (begin - 1);
        addition = interval/2;
        mid = begin+addition;
        if (frq > MLIJST[mid]){
            begin=mid;
        }
        else{
            if (frq < MLIJST[mid]){
                end = mid;
            }
            else{ /* found */
                VM[mid]++;
                found = 1;
            }
        }
    }
    else{
        if (frq < MLIJST[begin]){
            insertM(frq, begin, tracepos);
            found = 1;
        }
        else{
            if (frq == MLIJST[begin]){
               VM[begin]++;
               found = 1;
            }
            else{
               if (frq < MLIJST[end]){
                   insertM(frq,end, tracepos);
                   found=1;
               }
               else{
                   if (frq==MLIJST[end]){
                        VM[end]++;
                        found=1;
                   }
                   else{
                        insertM(frq, end+1, tracepos);
                        found = 1;
                   }
               }
            }
        }
    }
 }   
}


void insertM(freq, pos, tracepos)
int freq, pos, tracepos;
{
 int ii;

 for (ii = maantal+1; ii > pos; ii--){
     MLIJST[ii] = MLIJST[ii-1];
     VM[ii] = VM[ii-1];
 }

 MLIJST[pos] = freq;
 VM[pos] = 1;

 maantal++;
}
