############################################################################### # Answers to exercises in Chapter 5: Linear Models ############################################################################### library(ALL); data(ALL) # 1. Analysis of gene expressions of B-cell ALL patients. # (a) ALLB <- ALL[,ALL$BT %in% c("B","B1","B2","B3","B4")] # (b) table(ALLB$BT) # (c) shapiro.pValues <- apply(exprs(ALLB), 1, function(x) shapiro.test(residuals(lm(x ~ ALLB$BT)))$p.value) head(shapiro.pValues) # (d) library(lmtest) bp.pValues <-apply(exprs(ALLB), 1, function(x) as.numeric(bptest(lm(x ~ ALLB$BT),studentize = FALSE)$p.value)) head(bp.pValues) # (e) sum(shapiro.pValues > 0.05) sum(bp.pValues > 0.05) sum(shapiro.pValues > 0.05 & bp.pValues > 0.05) # 2. Further analysis of gene expressions of B-cell ALL patients. # (a) anova.pValues <- apply(exprs(ALLB), 1, function(x) anova(lm(x ~ ALLB$BT))$Pr[1]) head(anova.pValues) # (b) featureNames(ALLB)[anova.pValues<0.000001] # (c) kruskal.pValues <- apply(exprs(ALLB), 1, function(x) kruskal.test(x ~ ALLB$BT)$p.value) head(kruskal.pValues) # (d) featureNames(ALLB)[kruskal.pValues<0.000001] # (e) anova.pValues.001 <- anova.pValues < 0.001 kruskal.pValues.001 <- kruskal.pValues < 0.001 table(anova.pValues.001,kruskal.pValues.001) # There are 124 significant gene expressions from ANOVA which are not # significant on Kruskal-Wallis. There are only 38 significant gene ex- # pressions from Kruskal-Wallis which are non-significant according to # ANOVA. The tests agree on the majority of gene expressions. # 3. Finding the ten best best genes among gene expressions of B-cell ALL # patients. # (a) best.anova.pValues = sort(anova.pValues)[1:10] best.anova.pValues # (b) best.kruskal.pValues = sort(kruskal.pValues)[1:10] best.kruskal.pValues # (c) anova.pValues.names <- names(best.anova.pValues) kruskal.pValues.names <- names(best.kruskal.pValues) intersect(anova.pValues.names,kruskal.pValues.names) # 4. A simulation study for ANOVA. # (a) x <- matrix(rnorm(90000),nrow = 10000, ncol = 9) # (b) factor <- gl(3,3) # (c) anova.pValues <- apply(x, 1, function(x) anova(lm(x ~ factor))$Pr[1]) sum(anova.pValues<0.05) # (d) # The number of false positives is 514. The expected number is ff ¢ n = # 0:05 ¢ 10, 000 = 500, which is quite close to the observed. # A matrix with differences between three groups of gene expression val- # ues. # (e) sigma <- 1; n <- 10000 data <- cbind(matrix(rnorm(n*3,0,sigma),ncol=3), matrix(rnorm(n*3,1,sigma), ncol = 3),matrix(rnorm(n*3,2,sigma), ncol = 3)) factor <- gl(3,3) anova.pValues <- apply(data, 1, function(x) anova(lm(x ~ factor))$Pr[1]) sum(anova.pValues<0.05) kruskal.pValues <- apply(data, 1, function(x) kruskal.test(x ~ factor)$p.value) sum(kruskal.pValues<0.05) # Thus the number of true positives from ANOVA is 3757 and the num- # ber of false negatives is 6243. For the Kruskal-Wallis test there are # 1143 true positives and 8857 false negatives. This can be impoved by # increasing the number of gene expressions per group.