###############################################################################
###############################################################################
# Chapter 8
###############################################################################
###############################################################################

########################################
# 8.1 Classification of microRNA
########################################

# Example 1. Classification of Micro RNA. MicroRNA are small RNA molecules...

########################################
# 8.2 ROC types of curves
########################################

# Example 1. For the sake of illustration we consider the prediction of ALL...
data(golub, package = "multtest")
golubLabels <- factor(golub.cl,levels=0:1,labels= c(" ALL","not ALL"))
ccnd3 <- grep("CCND3",golub.gnames[,2], ignore.case = TRUE)
golubPredictor <- factor(golub[ccnd3,]>1.27,levels=c("TRUE","FALSE"), labels=c("ALL","notALL"))
table(golubPredictor,golubLabels)

# Example 2. The expression values for gene CCND3 Cyclin D3 from the...
library(ROCR)
golubLabels <- factor(golub.cl,levels=0:1,labels= c("TRUE","FALSE"))
pred <- prediction(golub[ccnd3,], golubLabels)
perf <- performance(pred, "tpr", "fpr" )
plot(perf,
     lwd=4,
     col="magenta")
## dev.copy2eps(device=x11,file="ROCgolub1042.eps")
performance(pred,"auc")

index <- c(1,1)
gdf5 = grep("GDF5",golub.gnames[ ,2], ignore.case = TRUE)
gdf5
data <- c(sort(golub[gdf5,1:27],decreasing = TRUE),sort(golub[gdf5,28:38],decreasing = TRUE))
for (i in 1:38) {
    if (golub.cl[i] == 0) {
        index[i] <- 2 }
    else {
        index[i] <-golub.cl[i]
    }
}
pred <- prediction(data, index)
perf <- performance(pred, "tpr", "fpr" )
plot(perf,
     lwd=4,
     col="magenta")
## dev.copy2eps(device=x11,file="ROCgolub2058.eps")
performance(pred,"auc" )


########################################
# 8.3 Classification trees
########################################

# Example 1. Optimal gene expressions. Suppose microarray expres-...
library(rpart)
library(rpart.plot)
set.seed(123); n<-10 ; sigma <- 0.5
factor <- factor(c(rep(1,n),rep(2,n),rep(3,n)))
levels(factor) <- c("ALL1","ALL2","AML")
geneA <- c(rnorm(10,0,sigma),rnorm(10,2,sigma),rnorm(10,4,sigma))
data <- data.frame(factor,geneA)
rpartFit <- rpart(factor ~ geneA, method="class",data=data)
boxplot(geneA ~ factor, col=c("blue", "red", "darkgreen"))
## dev.copy2eps(device=x11,file="rpartGeneAboxplot.eps")
prp(rpartFit,
    branch.lwd=4, # wide branches
    branch.col="darkgreen",
    extra=101)    # plot label numbers and the percentage of observations
## dev.copy2eps(device=x11,file="rpartGeneATree.eps")

# Example 2. Gene selection. Another situation is where Gene A discrim-...
set.seed(123)
n<-10 ; sigma <- 0.5
factor <- factor(c(rep(1,n),rep(2,n),rep(3,n)))
levels(factor) <- c("ALL1","ALL2","AML")
geneA <- c(rnorm(20,0,sigma),rnorm(10,2,sigma))
geneB <- c(rnorm(10,0,sigma),rnorm(20,2,sigma))
geneC <- c(rnorm(30,1,sigma))
data <- data.frame(factor,geneA,geneB,geneC)
rpartFit <- rpart(factor ~ geneA + geneB + geneC, method="class",data=data)
boxplot(geneA ~ factor, col=c("blue", "blue", "red"))
## dev.copy2eps(device=x11,file="rpartGeneABCboxplot.eps")
prp(rpartFit,
    branch.lwd=4, # wide branches
    branch.col="blue",
    extra=101)    # plot label numbers and the percentage of observations
## dev.copy2eps(device=x11,file="rpartGeneABCTree.eps")

# Example 3. Prediction by CCND3 Cyclin D3 gene expression values....
library(rpart); library(multtest); data(golub)
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
ccnd3 <- grep("CCND3",golub.gnames[,2], ignore.case = TRUE)
golubRpart <- rpart(golubFactor ~ golub[ccnd3,] , method="class")
predictedclass <- predict(golubRpart, type="class")
table(predictedclass, golubFactor)

summary(golubRpart)
predict(golubRpart, type="class")
predict(golubRpart, type="prob")
boxplot(golub[1042,] ~ golubFactor, col=c("blue", "red"))
## dev.copy2eps(device=x11,file="rpartgolubBoxPlot1042.eps")
prp(golubRpart,
     branch.lwd=4, # wide, thick branches
     branch.col="blue",
     extra=101)    # plot label numbers and the percentage of observations
## dev.copy2eps(device=x11,file="rpartgolub1042.eps")

# Example 4. Gene selection of the Golub (1999) data. By recursive...
library(rpart); library(multtest); data(golub)
row.names(golub)<- paste("gene", 1:3051, sep = "")
golubData <- data.frame(t(golub[1:3051,]))
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
golubRpart <- rpart(golubFactor~., data=golubData, method="class", cp=0.001)
prp(golubRpart,
     branch.lwd=4, # wide, thick branches
     branch.col="blue",
     extra=101)    # plot label numbers and the percentage of observations
golub.gnames[896,]

# Example 5. Application to the Chiaretti (2004) data. With respect to...
library("hgu95av2.db"); library(ALL); data(ALL)
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
anova.pValue <- apply(exprs(ALLB123), 1, function(x) anova(lm(x ~ ALLB123$BT))$Pr[1])
names <- featureNames(ALL)[anova.pValue<0.00001]
symb <- mget(names, env = hgu95av2SYMBOL)
ALLBTnames <- ALLB123[names, ]
probeData <- as.matrix(exprs(ALLBTnames))
row.names(probeData)<-unlist(symb)

diagnosed <- factor(ALLBTnames$BT)
rpartFit <- rpart(diagnosed ~ ., data = data.frame(t(probeData)))
prp(rpartFit,
    branch.lwd=4,
    branch.col="blue",
    extra=101)
dev.copy2eps(device=x11,file="rpartOnALLB123.eps")
rpartPredict <- predict(rpartFit, type="class")
table(rpartPredict,diagnosed)

predicted.class <- predict(rpartFit, type="class")
predicted.probabilities <- predict(rpartFit, type="prob")
out <- data.frame(predicted.probabilities,predicted.class,
                  diagnosis=factor(ALLBTnames$BT))
print(out,digits=2)

# Example 6. Training and validation. In the setting of B-cell ALL data...
training <- sample(1:78, 39, replace = FALSE)
testing <- setdiff(1:78,training)
df <- data.frame(t(probeData))
rpartFit <- rpart(diagnosed ~ ., data = df, subset=training)

rpart.predictor.t <- predict(rpartFit, df[training,], type="class")
table(rpart.predictor.t,factor(ALLBTnames$BT[training]))

rpart.predictor.v <- predict(rpartFit,df[testing,], type="class")
table(rpart.predictor.v,factor(ALLBTnames$BT[testing]))

########################################
# 8.4 Random Forest
########################################
library(randomForest)
Y <- factor(ALLBTnames$BT);X <- t(probeData)
rf1 <- randomForest(X, Y, ntree = 1000, importance=TRUE, proximity=TRUE)
rf1
plot(rf1, main=NULL, lwd=3)
legend("topright",c("OOB", "B1","B2","B3"),merge=TRUE,col=c(1,2,3,4),lty=c(1,1,1,1), lwd=c(4,4,4,4))

varImpPlot(rf1,
           n.var = 15,
           pch=19,
           main=NULL,
           col="red",
           gcolor="blue",
           lcolor="darkgreen")
dev.copy2eps(device=x11,file="varImpPlotRandomForestOnALLB123.eps")

rft <- randomForest(X[training,], Y[training], ntree = 1000, importance = TRUE, proximity=TRUE)
table(rfpredt=rft$pred,Yt=Y[training])

pv <- predict(rft, newData=X[testing,], type="class", proximity=TRUE)
table(rfpredv=pv$pred,Yv=Y[testing])


########################################
# 8.5 Support Vector Machine
########################################

# Example 1. Application to the Chiaretti (2004) data. The parameters...
library(e1071)
df <- data.frame(Y = factor(ALLBTnames$BT), X =t(probeData))
Y <- factor(ALLBTnames$BT); X <- t(probeData)
Yt <- factor(ALLBTnames$BT)[training]; Yv <- factor(ALLBTnames$BT)[testing]
X <- t(probeData); Xt <- X[training,]; Xv <- X[testing,]
svmEst <- svm(X, Y, data=df, type = "C-classification", kernel = "linear", probability = TRUE)
svmPredict <- predict(svmEst, X, probability=TRUE)
table(svmPredict, Y)
summary(svmEst)
dim(svmEst$SV)
dim(svmEst$coefs)

# Example 2. Training and validation. A generally applied manner to...
Yt <- factor(ALLBTnames$BT)[training]; Yv <- factor(ALLBTnames$BT)[testing]
X <- t(probeData); Xt <- X[training,]; Xv <- X[testing,]
svmEst <- svm(Xt, Yt, type = "C-classification", kernel = "linear", probability = TRUE)
svmPredict.t <- predict(svmEst, Xt, probability=TRUE)
table(svmPredict.t, Yt)
svmPredict.v <- predict(svmEst, Xv, probability=TRUE)
table(svmPredict.v, Yv)

########################################
# 8.6 Neural Networks
########################################

# Example 1. Application to the Chiaretti (2004) data. The models can...
Y <- factor(ALLBTnames$BT); X <- t(probeData)
library(nnet)
df <- data.frame(Y = Y, X = X[, sample(ncol(X), 20)])
nnest <- nnet(Y ~ .,data = df, size = 5, maxit = 500, decay = 0.01, MaxNWts = 5000)
pred <- predict(nnest, type = "class")
table(pred, Y) # prints confusion ma

# Example 2. Training and validation. The results from cross validation...
nnest.t <- nnet(Y ~ ., data = df,subset=training, size = 5,decay = 0.01,maxit=500)
prednnt <- predict(nnest.t, df[training,], type = "class")
table(prednnt,Ytrain=Y[training])
prednnv <- predict(nnest.t, df[testing,], type = "class")
table(prednnv, Yval= Y[testing])

########################################
# 8.7 Generalized Linear Models
########################################

# Example 1. CCND3 Cyclin D3. In the Golub et al. (1999) data we...
library(faraway)
ccnd3 <- grep("CCND3",golub.gnames[,2], ignore.case = TRUE)
logitmod <- glm((-golub.cl + 1) ~ golub[ccnd3,],
                family=binomial(link = "logit"))
pchisq(deviance(logitmod),df.residual(logitmod),lower=FALSE)
plot((-golub.cl + 1) ~ golub[ccnd3,], xlim=c(-2,5), ylim = c(0,1),
     xlab="CCND3 expression values ", ylab="Probability of ALL")
x <- seq(-2,5,.1)
lines(x,ilogit(-4.844124 + 4.439953*x))
pchisq(deviance(logitmod),df.residual(logitmod),lower=FALSE)
summary(logitmod)

predictor <- predict(logitmod,type="response") > 0.5
predictor.fac <- factor(predictor,levels=c(TRUE,FALSE),labels=c("ALL","not ALL"))
table(predictor.fac,golubFactor)

# Example 2. Application to the Chiaretti (2004) data. With respect to...
library(nnet); library("hgu95av2.db"); library(ALL); data(ALL)
probe.names <- c("1389_at","35991_at","40440_at")
ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")]
probeData <- exprs(ALLB123)[probe.names,]
row.names(probeData) <- unlist(mget(probe.names, env = hgu95av2SYMBOL))
factor <- factor(ALLB123$BT,levels=c("B1","B2","B3"))
data <- data.frame(factor,t(probeData))
mnmod <- multinom(factor ~ ., family=binomial(link = "logit"),data=data)
summary(mnmod)

predict.mn <- predict(mnmod,type="class")
table(predict.mn,factor)