############################################################################### # Answers to exercises in Chapter 8: Classification Methods. ############################################################################### # 1. Classification tree of Golub data. Use recursive partitioning in rpart library("hgu95av2.db"); library(ALL); data(ALL) # install.packages(c("rpart"),repo="http://cran.r-project.org",dep=TRUE) library(rpart) library(rpart.plot) ALLB123 <- ALL[,ALL$BT %in% c("B1","B2","B3")] anova.pValue <- apply(exprs(ALLB123), 1, function(x) anova(lm(x ~ ALLB123$BT))$Pr[1]) names <- featureNames(ALL)[anova.pValue<0.001] symb <- mget(names, env = hgu95av2SYMBOL) ALLBTnames <- ALLB123[names, ] probeData <- as.matrix(exprs(ALLBTnames)) row.names(probeData)<-unlist(symb) diagnosed <- factor(ALLBTnames$BT) rpartDepth2 <- rpart(diagnosed ~ ., data = data.frame(t(probeData)), maxdepth=2) prp(rpartDepth2, branch.lwd=4, branch.col="blue", extra=101) rpartFit <- rpart(diagnosed ~ ., data = data.frame(t(probeData))) prp(rpartFit, branch.lwd=4, branch.col="blue", extra=101) # 2. Sensitivity versus specificity. # (a) library(multtest);library(ROCR); data(golub) golub.cl.increment <- -golub.cl +1 ccnd3 <- grep("CCND3",golub.gnames[,2], ignore.case = TRUE) ccnd3 pred <- prediction(golub[ccnd3,], golub.cl.increment) perf <- performance(pred, "sens", "spec") plot(perf) # (b) The function is essentially the same. # (c) Use auc as before. # 3. Comparing Classification Methods. library(rpart) predictors <- matrix(rnorm(100*4,0,1),100,4) colnames(predictors) <- letters[1:4] groups <- gl(2,50) simData <- data.frame(groups,predictors) simData.rpart <- rpart(groups ~ a + b + c + d,method="class",data=simData) predicted <- predict(simData.rpart,type="class") table(predicted,groups) prp(simData.rpart, branch.lwd=4, # wide branches branch.col="darkgreen", extra=101) # plot label numbers and the percentage of observations table(predicted,groups) library(e1071) svmest <- svm(predictors, groups, data=df, type = "C-classification", kernel = "linear") svmpred <- predict(svmest, predictors, probability=TRUE) table(svmpred, groups) library(nnet) nnest <- nnet(groups ~ ., data = simData, size = 5, maxit = 500, decay = 0.01, MaxNWts = 5000) pred <- predict(nnest, type = "class") table(pred, groups) # prints confusion matrix # The misclassification rate of rpart, svm, and nnet is, respectively, 21/100, # 44/100, and 15/100. If we increase the number of predictors, then the # misclassification rate decreases. # 4. Prediction of achieved remission. library(ALL); library(hgu95av2.db); library(rpart); data(ALL) ALL.remission <- ALL[,which(pData(ALL)$remission %in% c("CR","REF"))] factor.remission <-factor(pData(ALL.remission)$remission) anova.pValue <- apply(exprs(ALL.remission), 1, function(x) { t.test(x ~ factor.remission)$p.value } ) names <- featureNames(ALL.remission)[anova.pValue<.001] ALL.remission.different <- ALL.remission[names,] data <- data.frame(t(exprs(ALL.remission.different))) all.rpart <- rpart(factor.remission ~., data, method="class", cp=0.001) prp(all.rpart, branch.lwd=4, # wide branches branch.col="darkgreen", extra=101) # plot label numbers and the percentage of observations predicted <- predict(all.rpart,type="class") table(predicted,factor.remission) misclassificationRate = 7/(93+1+6+14) misclassificationRate mget(c("1840_g_at","36769_at","1472_g_at","854_at"), env = hgu95av2GENENAME) # 5. Gene selection by area under the curve. library(ROCR); data(golub, package = "multtest") golubFactorTrueFalse <- factor(golub.cl,levels=0:1,labels= c("TRUE","FALSE")) auc.values <- apply(golub,1, function(x) performance(prediction(x, golubFactorTrueFalse),"auc")@y.values[[1]]) o <- order(auc.values,decreasing=TRUE) golub.gnames[o[1:25],2] # 6. Classification Tree for E. coli. ecoli <- read.table("http://www.grappa.univ-lille3.fr/~torre/Recherche/Datasets/downloads/ecoli/ecoli.data",sep=",",header = TRUE) colnames(ecoli) <- c("SequenceName","mcg","gvh","lip","chg","aac","alm1","alm2","ecclass") ecolisel<- ecoli[which(ecoli$ecclass %in% c("cp","im","pp")),] ecolisel$ecclass <- factor(ecolisel$ecclass, levels=c("cp","im","pp")) library(rpart) rpartFit <- rpart(ecolisel$ecclass ~ mcg + gvh + lip + aac + alm1 + alm2,data=ecolisel) prp(rpartFit, branch.lwd=4, # wide branches branch.col="darkgreen", extra=101) # plot label numbers and the percentage of observations title(main = "Classification Tree for 3 E. coli classes: CP, IM, and PP") predictedclass <- predict(rpartFit, type="class") table(predictedclass,ecolisel$ecclass) #predictors are alm1, gvh and im misclassificationRate = (1+2+7+4)/length(ecolisel$ecclass) misclassificationRate