#####Chapter 3: Maximum Likelihood Estimation

####Section 3.1:  MLE with a Normal Distribution
# Normal density: probability of the data point x = 75 under
# X ~ N(mu = 98, sigma = 15)
mu <- 98
sigma <- 15
x <- 75
dnorm(x, mu, sigma)

# Same density on the log scale (TRUE, not T, which can be reassigned)
dnorm(x, mu, sigma, log = TRUE)

# Likelihood and log-likelihood of a sample given fixed mu and sigma.
# The raw likelihood is a product of small densities and can underflow;
# the log-likelihood (a sum) is the numerically stable form.
x <- c(75, 87, 89, 90, 93, 99, 100, 101, 103, 112, 116, 135)
mu <- 98
sigma <- 15
like <- prod(dnorm(x, mu, sigma))
like
loglike <- sum(dnorm(x, mu, sigma, log = TRUE))
loglike

#GRID SEARCH METHOD USING LOG LIKELIHOOD
# Normal log-likelihood of the sample x under N(mu, sigma^2):
# per observation, -0.5*log(2*pi) - log(sigma) - (x - mu)^2 / (2*sigma^2),
# summed over the data vector.
normprob = function (x,mu,sigma) {
  const.term <- -0.5 * (log(2 * pi))
  scale.term <- -0.5 * log(sigma^2)
  quad.term <- -((x - mu)^2) / (2 * sigma^2)
  sum(const.term + scale.term + quad.term)
}
# Grid-search the log-likelihood surface over candidate means and SDs
X <- c(75, 87, 89, 90, 93, 99, 100, 101, 103, 112, 116, 135)
means <- seq(95, 105, by = 0.1)
vars <- seq(10, 20, by = 0.5)
mean.times <- length(means)

# Profile the log-likelihood in each parameter with the other held fixed.
# normprob() already returns a scalar sum, so the original prod() wrapper
# was a no-op and is dropped; vapply() pins the return type.
mean.likes <- vapply(means, function(y) normprob(x = X, mu = y, sigma = 10), numeric(1))
var.likes <- vapply(vars, function(y) normprob(x = X, mu = 95, sigma = y), numeric(1))
plot(means, mean.likes, type = "b")
windows()                       # opens a second graphics device (Windows-only)
plot(vars, var.likes, type = "b")

# Expand to the full (mean, var) grid and evaluate every combination
means <- rep(means, each = length(vars))
vars <- rep(vars, mean.times)
mv.mat <- cbind(means, vars)
mv.likes <- apply(mv.mat, 1, function(y) normprob(x = X, mu = y[1], sigma = y[2]))

# BUG FIX: index by ROW to recover the best (mean, var) pair; the original
# mv.mat[mv.likes == max(mv.likes)] treated the matrix as a vector and
# returned only the first-column entry, not the parameter pair.
mv.mat[mv.likes == max(mv.likes), ]
best.combo <- which(mv.likes == max(mv.likes))
best.combo
max.log <- mv.likes[mv.likes == max(mv.likes)]
max.log

# MLE in R: maximize the normal log-likelihood numerically with bbmle.
# mle2() minimizes, so norm.fit returns the NEGATIVE log-likelihood.
norm.fit <- function(mu, sigma) {
  -sum(dnorm(x, mu, sigma, log = TRUE))
}
library(bbmle)                        # attach bbmle package
x <- c(75, 87, 89, 90, 93, 99, 100, 101, 103, 112, 116, 135)
# BUG FIX: name the data element (data = list(x = x)); the original
# unnamed list(x) is not visible to the likelihood by name, so the call
# silently relied on the global x instead of the supplied data.
mle.results <- mle2(norm.fit, start = list(mu = 95, sigma = 10), data = list(x = x))
mle.results

### Section 3.2: Derivatives and Standard Errors
# Define f(x, y) = 3x^3 - 4x^2 + 5y^2 - 6y symbolically.
# BUG FIX: the original passed a second element 'x,y' to expression(),
# mimicking deriv() syntax; stats::D() takes the differentiation variable
# as its own argument, so the expression must contain only the formula.
funct <- expression(3*x^3 - 4*x^2 + 5*y^2 - 6*y)

# First and second partial derivatives with respect to x
derivx1 <- D(funct, 'x')     # 9x^2 - 8x
derivx1
derivx2 <- D(derivx1, 'x')   # 18x - 8
derivx2

# First and second partial derivatives with respect to y
derivy1 <- D(funct, 'y')     # 10y - 6
derivy1
derivy2 <- D(derivy1, 'y')   # 10
derivy2

#Standard Errors for High and Low Variance Distributions
# Negative log-likelihood (up to an additive constant) of a normal sample:
# theta[1] is the mean, theta[2] the VARIANCE. Reads the data vector x
# from the calling environment; written for nlm(), which minimizes,
# hence the sign convention.
loglike <- function(theta) {
  mean.par <- theta[1]
  var.par <- theta[2]
  squared.dev <- (x - mean.par)^2
  sum(0.5 * squared.dev / var.par + 0.5 * log(var.par))
}

# Standard errors for the Low Variance distribution
x <- c(75, 87, 89, 90, 93, 99, 100, 101, 103, 112, 116, 135)

# Minimize the negative log-likelihood; start at (mean, variance) = (100, 15)
lik.est <- nlm(loglike, c(100, 15), hessian = TRUE)

# BUG FIX: invert the FULL observed-information matrix and take its [1,1]
# entry; the original solve(lik.est$hessian[1]) inverted a single scalar,
# ignoring the curvature coupling between the mean and the variance.
std.err.low <- sqrt(solve(lik.est$hessian)[1, 1])
std.err.low

# 95% critical value (df = 12 kept from the original text; n - 1 = 11
# would be the conventional choice)
t.crit <- abs(qt(0.025, 12))

# BUG FIX: center the interval at the MLE lik.est$estimate[1]; the
# original used theta[1], i.e. the STARTING value 100, not the estimate.
CI.low <- lik.est$estimate[1] - t.crit * std.err.low
CI.high <- lik.est$estimate[1] + t.crit * std.err.low
both.low <- cbind(CI.low, CI.high)
both.low

# Standard errors for the High Variance distribution (same model, more
# spread-out data, so the SE and the CI are wider)
x <- c(65, 66, 70, 71, 75, 79, 120, 119, 125, 131, 131, 148)
lik.est <- nlm(loglike, c(100, 15), hessian = TRUE)

# BUG FIX: invert the full observed-information matrix, not the scalar
# lik.est$hessian[1] (see the low-variance case above)
std.err.high <- sqrt(solve(lik.est$hessian)[1, 1])
std.err.high

# BUG FIX: center the interval at the MLE, not at the starting value theta[1]
CI.low <- lik.est$estimate[1] - t.crit * std.err.high
CI.high <- lik.est$estimate[1] + t.crit * std.err.high
both.high <- cbind(CI.low, CI.high)
both.high


### Section 3.4: Fisher's Method of Scoring
###################### The following is for R code produced in Table 3.4.3
# Fisher scoring for the simple linear regression y = b0 + b1*x + e,
# e ~ N(0, sigma^2); parameter vector theta = (b0, b1, sigma^2).
x <- c(1, 9, 1, 5, 6, 8, 2, 4, 2, 8, 7, 7)
y <- c(3, 8, 2, 8, 5, 9, 4, 5, 2, 4, 2, 6)
N <- length(y)
X <- cbind(1, x)              # design matrix with an intercept column

# Initialize
theta <- c(2, 3, 4)           # starting values for (b0, b1, sigma^2)
tol <- 1e-8
prev.norm <- 10               # norm of theta at the previous iterate
iter <- 0
change <- 1

# Iterate theta <- theta + I^{-1} g until the parameter vector stops moving
while (change > tol) {
  # Score (gradient of the log-likelihood) for beta and for sigma^2
  grad.1 <- (t(X) %*% y - t(X) %*% X %*% theta[1:2]) / theta[3]
  resid <- y - X %*% theta[1:2]
  grad.2 <- -N / (2 * theta[3]) + (t(resid) %*% resid) / (2 * theta[3]^2)
  g <- rbind(grad.1, grad.2)

  # Expected (Fisher) information: block-diagonal in (beta, sigma^2)
  hess.1 <- -(t(X) %*% X) / theta[3]
  hess.2 <- -N / (2 * theta[3]^2)
  p <- ncol(X)
  n <- p + 1
  info <- matrix(0, n, n)
  info[1:p, 1:p] <- hess.1
  # FIX: address the corner element as [n, n]; the original linear index
  # I[n*n] only hit the same element by coincidence for a 3x3 matrix
  info[n, n] <- hess.2
  info <- -info

  # Scoring update
  theta <- theta + solve(info) %*% g

  # Convergence check on the change in the Frobenius norm of theta
  # (variable renamed from 'norm', which shadowed base::norm)
  cur.norm <- norm(theta, type = "F")
  change <- abs(prev.norm - cur.norm)
  prev.norm <- cur.norm
  iter <- iter + 1
}
theta; iter; change; info

# Same Result Using R's Optimization Function
# Negative log-likelihood of the regression model, theta = (b0, b1, sigma^2).
# NOTE: this redefines the loglike from Section 3.2; both read data from
# the global environment (here x and y).
loglike <- function(theta) {
  -sum(dnorm(y, mean = theta[1] + theta[2] * x, sd = sqrt(theta[3]), log = TRUE))
}
# BFGS from the same start as the scoring loop; the start vector is passed
# directly instead of via a side-effect assignment inside the call
maxmod <- optim(c(2, 3, 4), loglike, hessian = TRUE, method = "BFGS")
maxmod
