proc (xr, yr) = discrete (x, y)
; ----------------------------------------------------------------------------
; Library       xplore
; ----------------------------------------------------------------------------
;  See_also     sort cumsum paf diff
; ----------------------------------------------------------------------------
;  Keywords     discretizing, binning
; -----------------------------------------------------------------
;   Macro       discrete
; ----------------------------------------------------------------------------
;   Description  Reduces a matrix to its distinct rows and 
;                gives the number of replications of each row
;                in the original dataset. An optional
;                second matrix y can be given, the rows of y
;                are summed up accordingly.
; ----------------------------------------------------------------------------
;   Usage        {xr,yr} = discrete(x{,y})
;   Input
;     Parameter   x
;     Definition        n x p matrix, the data matrix to reduce, in 
;                       regression usually the design matrix. The
;                       matrix may be numeric or string, in the latter
;                       case no y is possible.
;     Parameter   y
;     Definition        optional, n x q matrix, in regression 
;                       usually the observations of the dependent 
;                       variable. Not possible for string matrix x.
;   Output
;     Parameter   xr
;     Definition        m x p matrix, reduced data matrix (sorted).
;     Parameter   yr
;     Definition        m x 1 vector or m x (q+1) matrix, contains in the first
;                       column the number of replications. If y was given, sums
;                       of y-rows with same x-row are contained in the other
;                       q columns of yr.
; ----------------------------------------------------------------------------
;   Example   library("xplore")
;             n=100
;             b=1|2
;             x=ceil(normal(n,rows(b)))
;             y=x*b + normal(n)
;             ; --------------------------------------
;             ;  data reduction
;             ; --------------------------------------
;             {xr,yr}=discrete(x,y)
;             r =yr[,1]
;             yr=yr[,2]
;             rows(r)
;             ; --------------------------------------
;             ;  descriptive statistics of x
;             ; --------------------------------------
;             meanxr = sum(r.*xr)/sum(r)
;             varxr  = sum(r.*(xr-meanxr)^2)/(sum(r)-1)
;             mean(x)'~meanxr'
;             var(x)'~varxr'
;             ; --------------------------------------
;             ;  linear regression
;             ; --------------------------------------
;             b=inv(x'*x)*x'*y
;             br=inv(xr'*diag(r)*xr)*xr'*yr
;             b~br
; ----------------------------------------------------------------------------
;   Result    Matrices x, y with 100 rows are reduced to a matrix xr 
;             (containing distinct rows of x) and yr (sums of y with same 
;             rows in x). 
;             r gives the number of replications. The mean and variance 
;             of x coincide with the weighted mean and variance of xr.
;             The linear regression of y on x coincides with the weighted 
;             regression of yr on xr.
; ----------------------------------------------------------------------------
;   Author    Thomas Koetter, Marlene Mueller, 970325
; ----------------------------------------------------------------------------
  error(rows(dim(x))>2,"first argument must be vector or matrix")
  error((exist(x)<1)||(exist(x)>2),"first argument must be string or numeric")
;
  n = rows (x)
  p = cols(x)
  q = 0
;
; Non-finite entries of a numeric x are temporarily mapped to finite
; sentinel values lying strictly outside the range of the finite data,
; so that sorting and row comparison work; they are mapped back at the end.
;
  havexNaN=0
  if (exist(x)==1)
    if (prod(prod(isNumber(x),2))!=1)
      xtmp=replace(x,NaN,0)
      xtmp=replace(xtmp,Inf,0)
      xtmp=replace(xtmp,-Inf,0)
      vNaN =ceil(max(max(xtmp),2)+2)
      vpInf=ceil(max(max(xtmp),2)+1)
      vnInf=floor(min(min(xtmp),2)-1)
      error(vNaN==Inf,"cannot discretize these data")
      error(vnInf==-Inf,"cannot discretize these data")
      x=replace(x,NaN,vNaN)
      x=replace(x,Inf,vpInf)
      x=replace(x,-Inf,vnInf)   ; was replace(x,Inf,...): -Inf never got its sentinel
      havexNaN=1
    endif
  endif
;
  havey= exist(y)>0
  haveyNaN=0
;
  if (havey)
    error(exist(y)!=1,"second argument needs to be numeric")
    error(exist(x)!=1,"no second argument when first argument is string")
    error(rows(dim(y))>2,"second argument must be vector or matrix")
    error(rows(y)!=n,"first and second have different number of lines")
    q=cols(y)
    ; reduce over columns first, then rows, so the condition is a scalar
    ; for any q (same form as the check on x above)
    if (prod(prod(isNumber(y),2))!=1)
      ; append 3*q indicator columns (-Inf, Inf, NaN flags) and zero out the
      ; non-finite entries so that they do not poison the cumulative sums
      y=y~(y==-Inf)~(y==Inf)~(isNaN(y))
      y=replace(y, NaN,0)
      y=replace(y, Inf,0)
      y=replace(y,-Inf,0)
      haveyNaN=1
    endif
  endif
;
; Sort rows by the columns of x; y is carried along and then replaced by
; its cumulative sums, so group sums can be obtained as differences.
;
  if (havey)
    x = sort (x~y,(1:p))
    y = cumsum(x[,p+1:cols(x)])
    x = x[,1:p]
  else
    x = sort (x,(1:p))
  endif
;
  if (n > 1)
    ; d[i] is 1 where sorted row i+1 differs from row i (group boundary)
    d = sum(x[2:n] != x[1:n-1],2)>0
    if (sum(d)>0)
      ; s holds the index of the last row of each group, framed by 0 and n
      s = 0| paf (1:(n-1), d) |n
      xr = x[1+s[1:rows(s)-1]]
      yr = diff(s)     ; group sizes
      if (havey)
        yn = y[n]
        y0 = 0.*matrix(1,cols(y))
        y = y0| paf(y[1:n-1],d)| yn
        yr=yr~diff(y)  ; group sums as differences of cumulative sums
      endif
    else
      ; all rows of x are identical: one group of size n
      xr = x[1]
      yr = n
      if (havey)
        ; y already holds cumulative sums, so the total is its last row
        yr= yr~y[n]
      endif
    endif
  else
    xr = x
    yr = 1
    if (havey)
      yr= yr~y
    endif
  endif
;
; Restore non-finite values in the y sums. This is done after the
; branches above so that it also applies when there is only one group.
;
  if (haveyNaN)
    yr[,q+2:4*q+1]=(yr[,q+2:4*q+1]>0)
    yr[,2:q+1]=yr[,2:q+1]+replace(yr[,  q+2:2*q+1],1,-Inf)
    yr[,2:q+1]=yr[,2:q+1]+replace(yr[,2*q+2:3*q+1],1, Inf)
    yr[,2:q+1]=yr[,2:q+1]+replace(yr[,3*q+2:4*q+1],1, NaN)
    yr=yr[,1:q+1]
  endif
;
  if (havexNaN)
    xr=replace(xr,vNaN,NaN)
    xr=replace(xr,vpInf,Inf)
    xr=replace(xr,vnInf,-Inf)
  endif
endp



