###############################################################################
# Answers to exercises in Chapter 5: Linear Models
###############################################################################

library(ALL); data(ALL)

# 1. Analysis of gene expressions of B-cell ALL patients.

#   (a)
ALLB <- ALL[,ALL$BT %in% c("B","B1","B2","B3","B4")]

#   (b)
table(ALLB$BT)

#   (c)
shapiro.pValues <- apply(exprs(ALLB), 1, function(x) shapiro.test(residuals(lm(x ~ ALLB$BT)))$p.value)
head(shapiro.pValues)

#   (d)
library(lmtest)
bp.pValues <-apply(exprs(ALLB), 1, function(x) as.numeric(bptest(lm(x ~ ALLB$BT),studentize = FALSE)$p.value))
head(bp.pValues)

#   (e)
sum(shapiro.pValues > 0.05)
sum(bp.pValues > 0.05)
sum(shapiro.pValues > 0.05 & bp.pValues > 0.05)

# 2. Further analysis of gene expressions of B-cell ALL patients.

#   (a)
anova.pValues <- apply(exprs(ALLB), 1, function(x) anova(lm(x ~ ALLB$BT))$Pr[1])
head(anova.pValues)

#   (b)
featureNames(ALLB)[anova.pValues<0.000001]

#   (c)
kruskal.pValues <- apply(exprs(ALLB), 1, function(x) kruskal.test(x ~ ALLB$BT)$p.value)
head(kruskal.pValues)

#   (d)
featureNames(ALLB)[kruskal.pValues<0.000001]

#   (e)
anova.pValues.001 <- anova.pValues < 0.001
kruskal.pValues.001 <- kruskal.pValues < 0.001
table(anova.pValues.001,kruskal.pValues.001)
#   There are 124 significant gene expressions from ANOVA which are not
#   significant on Kruskal-Wallis. There are only 38 significant gene ex-
#   pressions from Kruskal-Wallis which are non-significant according to
#   ANOVA. The tests agree on the majority of gene expressions.

# 3. Finding the ten best best genes among gene expressions of B-cell ALL
#   patients.

#   (a)
best.anova.pValues = sort(anova.pValues)[1:10]
best.anova.pValues

#   (b)
best.kruskal.pValues = sort(kruskal.pValues)[1:10]
best.kruskal.pValues

#   (c)
anova.pValues.names <- names(best.anova.pValues)
kruskal.pValues.names <- names(best.kruskal.pValues)
intersect(anova.pValues.names,kruskal.pValues.names)

# 4. A simulation study for ANOVA.

#   (a)
x <- matrix(rnorm(90000),nrow = 10000, ncol = 9)

#   (b)
factor <- gl(3,3)

#   (c)
anova.pValues <- apply(x, 1, function(x) anova(lm(x ~ factor))$Pr[1])
sum(anova.pValues<0.05)

#   (d)
#   The number of false positives is 514. The expected number is ff ¢ n =
#   0:05 ¢ 10, 000 = 500, which is quite close to the observed.
#   A matrix with differences between three groups of gene expression val-
#   ues.

#   (e)
sigma <- 1; n <- 10000
data <- cbind(matrix(rnorm(n*3,0,sigma),ncol=3),
matrix(rnorm(n*3,1,sigma), ncol = 3),matrix(rnorm(n*3,2,sigma), ncol = 3))
factor <- gl(3,3)
anova.pValues <- apply(data, 1, function(x) anova(lm(x ~ factor))$Pr[1])
sum(anova.pValues<0.05)
kruskal.pValues <- apply(data, 1, function(x) kruskal.test(x ~ factor)$p.value)
sum(kruskal.pValues<0.05)
#   Thus the number of true positives from ANOVA is 3757 and the num-
#   ber of false negatives is 6243. For the Kruskal-Wallis test there are
#   1143 true positives and 8857 false negatives. This can be impoved by
#   increasing the number of gene expressions per group.