###############################################################################
# Answers to exercises in chapter 4: Estimation and Inference
###############################################################################

library(multtest); data(golub)
library(ape)

# 1. Gene CD33. Use grep("CD33",golub.gnames[,2], ignore.case = TRUE) to find index 808.
ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE)
ccnd3

#   (a) The code
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
shapiro.test(golub[ccnd3,golubFactor=="ALL"])
#   gives p-value = 0.592 and changing ALL into AML gives p-value
#   = 0.2583. Hence, the null hypothesis of normality is not rejected.

#   (b) var.test(golub[ccnd3,] ~ golubFactor) gives p-value = 0.1095 so the null hypothesis
#   of equality of variances is not rejected.
var.test(golub[ccnd3,] ~ golubFactor)

#   (c) t.test(golub[ccnd3,] ~ golubFactor, var.equal = TRUE) gives p-value = 1.773e-09,
#   so the null hypothesis of equality of means is rejected.
t.test(golub[ccnd3,] ~ golubFactor, var.equal = TRUE)

#   (d) Yes, t = -7.9813 is quite extreme given the p-value of 1.773e-09.

# 2. Gene MYBL2 (V-myb avian myeloblastosis viral oncogene homolog-like 2)

mybl2 = grep("MYBL2",golub.gnames[ ,2], ignore.case = TRUE)
mybl2

#   (a) Use boxplot(golub[mybl2,] ~ golubFactor) to observe from the box-
#   plot that one is quite certain that the null hypothesis of no exper-
#   imental effect holds.
boxplot(golub[mybl2,] ~ golubFactor)

#   (b) t.test(golub[mybl2,] ~ golubFactor, var.equal = TRUE) gives p-value = 0.8597,
#   so that the null hypothesis of equal means is not rejected.
t.test(golub[mybl2,] ~ golubFactor, var.equal = TRUE)

# 3. Gene HOXA9

hoxa9 = grep("HOXA9",golub.gnames[ ,2], ignore.case = TRUE)
hoxa9

#   (a) shapiro.test(golub[hoxa9[1],golubFactor=="ALL"]) gives p-value = 1.318e-07,
#   so the null hypothesis of normality is rejected.
shapiro.test(golub[hoxa9[1],golubFactor=="ALL"])

#   (b) wilcox.test(golub[hoxa9[1],] ~ golubFactor) gives p-value = 7.923e-05,
#   so the null hypothesis of equality of means is rejected. Note that the p-value from
#   Grubbs test of the ALL expression values is 0.00519, so the null
#   hypothesis of no outliers is rejected. Nevertheless the Welch two-
#   sample T-test also rejects the null hypothesis of equal means.
#   Its t-value equals -4.3026 and is quite large.
wilcox.test(golub[hoxa9[1],] ~ golubFactor)

# 4. Zyxin.

#   (a) Searching NCBI UniGene on zyxin gives BC002323.2.

#   (b) Use chisq.test(as.data.frame(table(read.GenBank(c("BC002323.2"))))$Freq)
#   to find p-value < 2.2e-16, so that the null hypothesis of equal
#   frequencies is rejected.
chisq.test(as.data.frame(BiocGenerics::table(read.GenBank(c("BC002323.2"))))$Freq)

#   (c) We download and store the frequencies of the sequences in x and
#   y. Next, the empirical probabilities from x are used to predict the
#   frequencies from y.
y <- as.data.frame(table(read.GenBank(c("X94991.1"))))$Freq
x <- as.data.frame(table(read.GenBank(c("BC002323.2"))))$Freq
chisq.test(y, p=x/sum(x))

# 5. Gene selection.
tTest.pValues <- apply(golub, 1, function(x) t.test(x ~ golubFactor, alternative = c("greater"))$p.value)
golub.gnames[order(tTest.pValues)[1:10],2]

# 6. Antigens.
library(multtest); data(golub)
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
tTest.pValues <- apply(golub, 1, function(x) { t.test(x ~ golubFactor)$p.value} )
antigens <-grep("antigen", golub.gnames[,2], ignore.case = TRUE)
golub.antigens<-golub[antigens,]
tTest.pValues.antigens<-tTest.pValues[antigens]
golub.gnames.antigens<-golub.gnames[antigens,]
golub.gnames.antigens[order(tTest.pValues.antigens)[1:length(antigens)],2]

# 7. Genetic Model. From the output below, the null hypothesis that the
#   probabilities are as specified is not rejected.
chisq.test(x=c(930,330,290,90),p=c(9/16,3/16,3/16,1/16))

# 8. Comparing two genes.
all66 <- golub[66,golubFactor=="ALL"]
all790 <- golub[790,golubFactor=="ALL"]
boxplot(all66,all790)
mean(all66); mean(all790)
median(all66); median(all790)
sd(all66); sd(all790)
IQR(all66)/1.349 ; IQR(all790)/1.349 # notice the scaling by 1.349!
mean(all66); mean(all790)
mad(all66); mad(all790)
shapiro.test(all66); shapiro.test(all790)

# 9. Normality tests.
library(multtest); data(golub)
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
all.shapiro.pValues <- apply(golub[,golubFactor=="ALL"], 1, function(x) shapiro.test(x)$p.value)
aml.shapiro.pValues <- apply(golub[,golubFactor=="AML"], 1, function(x) shapiro.test(x)$p.value)
100 * sum(all.shapiro.pValues>0.05)/length(all.shapiro.pValues)
100 * sum(aml.shapiro.pValues>0.05)/length(aml.shapiro.pValues)
100 * sum(all.shapiro.pValues>0.05 & aml.shapiro.pValues>0.05)/(length(all.shapiro.pValues)+length(aml.shapiro.pValues))

# 10. Two-sample tests on gene expression values of the Golub et al. (1999) data.

#   (a)
data(golub, package = "multtest")
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
tTest.pValues <- apply(golub, 1, function(x) t.test(x ~ golubFactor)$p.value)
o <- order(tTest.pValues,decreasing=FALSE)
golub.gnames[o[1:10],2]

#   (b)
wilcoxon.pValues <- apply(golub, 1, function(x) wilcox.test(x ~ golubFactor)$p.value)
o <- order(wilcoxon.pValues,decreasing=FALSE)
golub.gnames[o[1:10],2]

# 11. Biological hypotheses.

#   (a)
n = 1000
p = 0.05
np = n*p

#   (b)
pbinom(9,1000,.05)

#   (c)
sum(dbinom(6:1000,1000,.05))

#   (d)
sum(dbinom(2:8,1000,.05))

#   12. Programming some tests.

#   (a)
data(golub,package="multtest")
golubFactor <- factor(golub.cl,levels=0.1, labels= c("ALL","AML"))
ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE)
ccnd3

x <- golub[ccnd3,golubFactor=="ALL"]
n <- length(x)
y <- golub[ccnd3,golubFactor=="AML"]
m <- length(y)
t <- (mean(x)-mean(y))/sqrt(var(x)/n + var(y)/m)
v <- (var(x)/n + var(y)/m)^2/( (var(x)/n)^2/(n-1) + (var(y)/m)^2/(m-1))
2*pt(-abs(t),v)
mean(x) - mean(y) + qt(0.025,v)* sqrt(var(x)/n + var(y)/m)
mean(x) - mean(y) + qt(0.975,v)* sqrt(var(x)/n + var(y)/m)

#   (b)
z <- golub[ccnd3,]
sum(rank(z)[1:27]) - 0.5*27*(27+1)

#   (c)
x <- golub[ccnd3,golubFactor=="ALL"]
y <- golub[ccnd3,golubFactor=="AML"]
w <- 0
for (i in 1:27) {
    w <- w + sum(x[i]>y)
}
w