#################################################
#################################################
#								#
#                CHAPTER 7: 				#
#    Non-Parametric Approaches to Model		#
#     Prevalence and Force of Infection		#
#								#
# last update: 25/08/2012				#
#################################################
#################################################

setwd("c:/book/chapter7")

##################################################
## Section 7.1: Non-parameric Approaches         #
##								 #
##   7.1.1  The first non-parametric approaches ##
##################################################

#### HEP A data (restructed in bins from the original data)
############################################################
hepa1<- list(
AGE = c(0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
	17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
	35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
	53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
	71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85),
POS = c(4,
	11, 6, 20, 28, 55, 70, 38, 37, 31, 35, 27, 29, 25, 25, 26, 16, 18, 16, 
	16, 31, 34, 31, 36, 35, 25, 36, 46, 36, 31, 30, 22, 27, 29, 30, 29, 23,
	17, 20, 20, 20, 22, 26, 18, 13, 18, 21, 21, 18, 20, 18, 13, 12, 8, 7, 
	14, 7, 11, 8, 8, 8, 4, 4, 6, 4, 4, 7, 4, 2, 3, 3, 4, 5, 2, 3, 3, 0, 1, 
	4, 2, 2, 4, 1, 1, 1, 1),
NEG = c(0, 0, 0, 2, 2, 1, 3, 5, 2, 3, 1, 1, 1,
	2, 2, 1, 2, 5, 4, 0, 4, 10, 11, 9, 9, 8, 10, 19, 8, 12, 10, 20, 15, 20,
	23, 23, 32, 29, 24, 29, 37, 26, 38, 42, 42, 37, 34, 52, 39, 44, 44, 38,
	32, 47, 64, 57, 64, 62, 40, 47, 33, 26, 16, 23, 20, 24, 15, 24, 22, 17,
	19, 20, 7, 18, 12, 10, 15, 12, 9, 14, 10, 12, 9, 6, 8, 5),
NTOT = c(4, 
	11, 6, 22, 30, 56, 73, 43, 39, 34, 36, 28, 30, 27, 27, 27, 18, 23, 20, 
	16, 35, 44, 42, 45, 44, 33, 46, 65, 44, 43, 40, 42, 42, 49, 53, 52, 55,
	46, 44, 49, 57, 48, 64, 60, 55, 55, 55, 73, 57, 64, 62, 51, 44, 55, 71,
	71, 71, 73, 48, 55, 41, 30, 20, 29, 24, 28, 22, 28, 24, 20, 22, 24, 12,
	20, 15, 13, 15, 13, 13, 16, 12, 16, 10, 7, 9, 6))
	
hepa1<-data.frame(hepa1$AGE,hepa1$NEG,hepa1$POS,hepa1$POS+hepa1$NEG)
names(hepa1)<-c("AGE","POS","NEG","NTOT")

## Some functions
###################

pavit<- function(datai)
{
	pai1 <- pai2 <- datai
	N <- length(pai1)
	for(i in 1:(N - 1)) {
		if(pai2[i] > pai2[i + 1]) {
			pool <- (pai1[i] + pai1[i + 1])/2
			pai2[i:(i + 1)] <- pool
			k <- i + 1
			for(j in (k - 1):1) {
				if(pai2[j] > pai2[k]) {
				  pool.2 <- sum(pai1[j:k])/length(pai1[j:k])
				  pai2[j:k] <- pool.2
				}
			}
		}
	}
	return(list(pai1, pai2))
}



pavit.w<- function(datai2)
{
	gi<- datai2$POS/datai2$NTOT
	pai1 <- pai2 <- gi
	N <- length(pai1)
	ni<-datai2$NTOT
	for(i in 1:(N - 1)) {
		if(pai2[i] > pai2[i + 1]) {
			pool <- (ni[i]*pai1[i] + ni[i+1]*pai1[i + 1])/(ni[i]+ni[i+1])
			pai2[i:(i + 1)] <- pool
			k <- i + 1
			for(j in (k - 1):1) {
				if(pai2[j] > pai2[k]) {
				  pool.2 <- sum(ni[j:k]*pai1[j:k])/(sum(ni[j:k]))
				  pai2[j:k] <- pool.2
				}
			}
		}
	}
	return(list(pai1, pai2))
}


pava <- function (x, wt=rep(1,length(x)))
#  Compute the isotonic regression of numeric vector 'x', with
#  weights 'wt', with respect to simple order.  The pool-adjacent-
#  violators algorithm is used.  Returns a vector of the same length
#  as 'x' containing the regression.
#  02 Sep 1994 / R.F. Raubertas
{
   n <- length(x)
   if (n <= 1) return (x)
   if (any(is.na(x)) || any(is.na(wt))) {
      stop ("Missing values in 'x' or 'wt' not allowed")
   }
   lvlsets <- (1:n)
   repeat {
      viol <- (as.vector(diff(x)) < 0)  # Find adjacent violators
      if (!(any(viol))) break
      i <- min( (1:(n-1))[viol])        # Pool first pair of violators
      lvl1 <- lvlsets[i]
      lvl2 <- lvlsets[i+1]
      ilvl <- (lvlsets == lvl1 | lvlsets == lvl2)
      x[ilvl] <- sum(x[ilvl]*wt[ilvl]) / sum(wt[ilvl])
      lvlsets[ilvl] <- lvl1
   }
  return(x)
}


## Isotonic regression hepatitis A in BE ##
###########################################

xi<-hepa1$AGE
wi<-hepa1$NTOT
yi<-hepa1$POS/hepa1$NTOT


#### pavit (weighted and unweighted)

iso1<-pava(yi,wt=wi)  # weighted pava
ir <- isoreg(yi~xi)   # unweighted isoreg

j<- hepa1$AGE
gi<- hepa1$POS/hepa1$NTOT
pai<- gi
xx<- pavit.w(hepa1)
xx1<- pavit(pai)

####  FIGURE 7.1
windows(record=TRUE, width=5, height=5)
par(las=1,cex.axis=1.1,cex.lab=1.1,lwd=3,mgp=c(2, 0.5, 0),mar=c(4.1,4.1,4.1,3))

plot(hepa1$AGE,gi,cex=0.05*wi,ylim=c(0,1),xlab="age",ylab="seroprevalnce",lwd=1)
#lines(hepa1$AGE,xx[[2]],type="s")
lines(hepa1$AGE,xx1[[2]],type="s",lty=1)
#lines(xi,iso1,type="s")

ageii<-hepa1$AGE
haz1<- c(0,diff(xx1[[2]]))/(1-xx1[[2]])
fit.haz<- ksmooth(ageii,haz1,ker="normal",bandwidth=15,n.points=86)
fit.haz2<- ksmooth(ageii,haz1,ker="normal",bandwidth=20,n.points=86)
lines(fit.haz$x,fit.haz$y*4,lty=1) # kernel 
lines(fit.haz2$x,fit.haz2$y*4,lty=2) # kernel 
axis(side=4,at=c(0.0,0.2,0.4),labels=c(0.0,0.2,0.4)/4)
mtext(side=4,"force of infection", las=3,line=1.6)


##################################################
## Section 7.1: Non-parametric Approaches        #
##								 #
##   7.1.2  Local estimation by polynomials      #
##################################################
rm(list=ls(all=TRUE))

# Some kernel functions
tricube=function(u){
(1-(abs(u))^3)^3
}

tricubes=function(u){
70/81*(1-(abs(u))^3)^3*(abs(u)<=1)
}

epans=function(u){
3/4*(1-u^2)*(abs(u)<=1)
}

gausss=function(u){
dnorm(u)
}

### FIGURE 7.2
windows(record=TRUE, width=5, height=5)
par(las=1,cex.axis=1.1,cex.lab=1.1,lwd=3,mgp=c(2, 0.5, 0),mar=c(4.1,4.1,4.1,3))
grid=seq(-1,1,by=0.01)
plot(grid,3*gausss(3*grid),type="n",xlab="u",ylab="kernel function")
lines(grid,3*gausss(3*grid))
lines(grid,epans(grid),lty=2)
lines(grid,tricubes(grid),lty=3)


### Local estimation by polynomials
library(locfit)

### Reading the parvo data
data<-read.table("c:/book/chapter4/mumpsuk.dat",header=T)
attach(data)
y=pos/ntot
a=age

### FIGURE 7.3
par(las=1,cex.axis=1.1,cex.lab=1.1,lwd=3,mgp=c(2, 0.5, 0),mar=c(4.1,4.1,4.1,3))
plot(age,pos/ntot,cex=0.005*ntot,pch=21,xlab="age",ylab="seroprevalence",ylim=c(0,1))
lpfit<-locfit(y~a,family="binomial")
lpfitd1<-locfit(y~a,deriv="a",family="binomial")
lpfitd2<-locfit(y~a,deriv=c("a","a"),family="binomial")
#cbind(age,fitted(lpfit),fitted(lpfitd1),fitted(lpfitd2))
lines(a,fitted(lpfit),lty=1,lwd=2)

ageg=seq(from=min(age),to=max(age),by=0.1)
a0=5
a0v=5.5
b0=fitted(lpfit)[a0]
b1=fitted(lpfit)[a0]*(1-fitted(lpfit)[a0])*fitted(lpfitd1)[a0]
b2=fitted(lpfit)[a0]*(1-fitted(lpfit)[a0])*((1-2*fitted(lpfit)[a0])*(fitted(lpfitd1)[a0])^2+fitted(lpfitd2)[a0])/2
locpol2ata0=b0+b1*(ageg-a0v)+b2*(ageg-a0v)^2
lines(ageg,locpol2ata0,lwd=2)
points(a0v,fitted(lpfit)[a0],cex=1,pch=24,lwd=1)
points(a0v,fitted(lpfit)[a0],cex=2,pch=24)
abline(v=a0v,lty=2,lwd=1)
h=quantile(abs(age-a0v),probs=0.7)
kernf=tricube((ageg-a0v)/h)/10
lines(ageg[kernf>=0],kernf[kernf>=0],lty=3)


a0=20
a0v=20.5
b0=fitted(lpfit)[a0]
b1=fitted(lpfit)[a0]*(1-fitted(lpfit)[a0])*fitted(lpfitd1)[a0]
b2=fitted(lpfit)[a0]*(1-fitted(lpfit)[a0])*((1-2*fitted(lpfit)[a0])*(fitted(lpfitd1)[a0])^2+fitted(lpfitd2)[a0])/2
locpol2ata0=b0+b1*(ageg-a0v)+b2*(ageg-a0v)^2
lines(ageg,locpol2ata0,lwd=2)
points(a0v,fitted(lpfit)[a0],cex=1,pch=24,lwd=1)
points(a0v,fitted(lpfit)[a0],cex=2,pch=24)
abline(v=a0v,lty=2,lwd=1)
h=quantile(abs(age-a0v),probs=0.7)
kernf=tricube((ageg-a0v)/h)/10
lines(ageg[kernf>=0],kernf[kernf>=0],lty=3)


##################################################
## Section 7.1: Non-parameric Approaches         #
##								 #
##   7.1.3  Application to UK Mumps data         #
##################################################

### default choices: logit link, tricube kernel, degree 2 polynomial, local estimation by using 70% if nearest neighbors
par(mfrow=c(2,2),lwd=2,las=1,cex.axis=1,cex.lab=1.1,mgp=c(2,0.5, 0),mar=c(3.1,3.1,3.1,3))

### FIGURE 7.4 (top left)
plot(age,pos/ntot,cex=0.003*ntot,pch=19,xlab="age",ylab="seroprevalence",ylim=c(-0.1,1))
lpfit<-locfit(y~a,family="binomial")
lines(a,fitted(lpfit),lty=1,lwd=2)
# local fit force of infection based on the derivative (on the logit scale)
lpfitd1<-locfit(y~a,deriv=1,family="binomial")
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=1,lwd=2)
axis(side=4,at=c(0.0,0.1,0.2,0.3,0.4))
abline(h=0)
mtext(side=4,"force of infection", las=3,line=1.6,cex=0.9)

lpfit<-locfit(y~a,family="binomial",kern="epan")
lines(a,fitted(lpfit),lty=2,lwd=2)
# local fit force of infection based on the derivative (on the logit scale)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",kern="epan")
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=2,lwd=2)

lpfit<-locfit(y~a,family="binomial",deg=1)
lines(a,fitted(lpfit),lty=3,lwd=2)
# local fit force of infection based on the derivative (on the logit scale)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",deg=1)
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=3,lwd=2)

### FIGURE 7.4 (top right)
alpha=c(0,100000000000000000000)
plot(age,pos/ntot,cex=0.003*ntot,pch=19,xlab="age",ylab="seroprevalence",ylim=c(-0.1,1))
lpfit<-locfit(y~a,family="binomial",alpha=alpha)
lines(a,fitted(lpfit),lty=1,lwd=2)
# local fit force of infection based on the derivative (on the logit scale)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",alpha=alpha)
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=1,lwd=2)
axis(side=4,at=c(0.0,0.1,0.2,0.3,0.4))
abline(h=0)

alpha=c(0,5)
lpfit<-locfit(y~a,family="binomial",alpha=alpha)
lines(a,fitted(lpfit),lty=2,lwd=2)
# local fit force of infection based on the derivative (on the logit scale)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",alpha=alpha)
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=2,lwd=2)
mtext(side=4,"force of infection", las=3,line=1.6,cex=0.9)


## GENERALIZED CROSS VALIDATION FOR SELECTING THE BANDWIDTH (NEAREST NEIGHBOR OR CONSTANT)
###########################################################################################

## Nearest neighbor gcv with all other options put on default value
####################################################################
alpha=seq(0.2,0.8, by=0.05)
# plot with horizontal axis given by alpha
res=cbind(alpha,summary(gcvplot(y~a,family="binomial",alpha=alpha)))
plot(res[,1],res[,3],type="n",xlab="% Neighbors",ylab=" ")
lines(res[,1],res[,3])
mtext(side=2,"GCV",las=3,line=2.4,cex=0.9)
# This indicates 0.5 would be a reasonable choice

## Constant bandwidth gcv with all other options put on default value
######################################################################
alpha=cbind(rep(0,length(seq(5,25, by=1))),seq(5,25, by=1))
#plot(gcvplot(y~a,family="binomial",alpha=alpha))
# plot with horizontal axis given by alpha
res=cbind(alpha[,2],summary(gcvplot(y~a,family="binomial",alpha=alpha)))
plot(res[,1],res[,3],type="n",xlab="Bandwidth",ylab=" ")
lines(res[,1],res[,3])
mtext(side=2,"GCV",las=3,line=3,cex=0.9)
# This indicates 14 would be a reasonable choice

## Overlaid plot of the gcv choices for both type of bandwidths
## all other choices on default
###############################################################
### FIGURE 7.5
windows(record=TRUE, width=5, height=5)
par(las=1,cex.axis=1.1,cex.lab=1.1,lwd=3,mgp=c(2, 0.5, 0),mar=c(4.1,4.1,4.1,3))

plot(age,pos/ntot,cex=0.004*ntot,pch=19,xlab="age",ylab="seroprevalence",ylim=c(-0.1,1))
# nearest neighbor
alpha=0.5
lpfit<-locfit(y~a,family="binomial",alpha=alpha)
lines(a,fitted(lpfit),lty=1,lwd=2)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",alpha=alpha)
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=1,lwd=2)
axis(side=4,at=c(0.0,0.1,0.2,0.3,0.4))
abline(h=0)
# constant bandwidth
alpha=c(0,14)
lpfit<-locfit(y~a,family="binomial",alpha=alpha)
lines(a,fitted(lpfit),lty=2,lwd=2)
lpfitd1<-locfit(y~a,deriv=1,family="binomial",alpha=alpha)
lpfoi=fitted(lpfitd1)*fitted(lpfit)
lines(a,lpfoi,lty=2,lwd=2)
axis(side=4,at=c(0.0,0.1,0.2,0.3,0.4))
abline(h=0)
mtext(side=4,"force of infection",las=3,line=1.6)

## Conclusion: nearest neighbor local regression outperforms the constant bandwidth version 
## as it clearly better deals with sparse data for higher ages