############################################################################### # Answers to exercises in chapter 4: Estimation and Inference ############################################################################### library(multtest); data(golub) library(ape) # 1. Gene CD33. Use grep("CD33",golub.gnames[,2], ignore.case = TRUE) to find index 808. ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE) ccnd3 # (a) The code golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) shapiro.test(golub[ccnd3,golubFactor=="ALL"]) # gives p-value = 0.592 and changing ALL into AML gives p-value # = 0.2583. Hence, the null hypothesis of normality is not rejected. # (b) var.test(golub[ccnd3,] ~ golubFactor) gives p-value = 0.1095 so the null hypothesis # of equality of variances is not rejected. var.test(golub[ccnd3,] ~ golubFactor) # (c) t.test(golub[ccnd3,] ~ golubFactor, var.equal = TRUE) gives p-value = 1.773e-09, # so the null hypothesis of equality of means is rejected. t.test(golub[ccnd3,] ~ golubFactor, var.equal = TRUE) # (d) Yes, t = -7.9813 is quite extreme given the p-value of 1.773e-09. # 2. Gene MYBL2 (V-myb avian myeloblastosis viral oncogene homolog-like 2) mybl2 = grep("MYBL2",golub.gnames[ ,2], ignore.case = TRUE) mybl2 # (a) Use boxplot(golub[mybl2,] ~ golubFactor) to observe from the box- # plot that one is quite certain that the null hypothesis of no exper- # imental effect holds. boxplot(golub[mybl2,] ~ golubFactor) # (b) t.test(golub[mybl2,] ~ golubFactor, var.equal = TRUE) gives p-value = 0.8597, # so that the null hypothesis of equal means is not rejected. t.test(golub[mybl2,] ~ golubFactor, var.equal = TRUE) # 3. Gene HOXA9 hoxa9 = grep("HOXA9",golub.gnames[ ,2], ignore.case = TRUE) hoxa9 # (a) shapiro.test(golub[hoxa9[1],golubFactor=="ALL"]) gives p-value = 1.318e-07, # so the null hypothesis of normality is rejected. shapiro.test(golub[hoxa9[1],golubFactor=="ALL"]) # (b) wilcox.test(golub[hoxa9[1],] ~ golubFactor) gives p-value = 7.923e-05, # so the null hypothesis of equality of means is rejected. Note that the p-value from # Grubbs test of the ALL expression values is 0.00519, so the null # hypothesis of no outliers is rejected. Nevertheless the Welch two- # sample T-test also rejects the null hypothesis of equal means. # Its t-value equals -4.3026 and is quite large. wilcox.test(golub[hoxa9[1],] ~ golubFactor) # 4. Zyxin. # (a) Searching NCBI UniGene on zyxin gives BC002323.2. # (b) Use chisq.test(as.data.frame(table(read.GenBank(c("BC002323.2"))))$Freq) # to find p-value < 2.2e-16, so that the null hypothesis of equal # frequencies is rejected. chisq.test(as.data.frame(BiocGenerics::table(read.GenBank(c("BC002323.2"))))$Freq) # (c) We download and store the frequencies of the sequences in x and # y. Next, the empirical probabilities from x are used to predict the # frequencies from y. y <- as.data.frame(table(read.GenBank(c("X94991.1"))))$Freq x <- as.data.frame(table(read.GenBank(c("BC002323.2"))))$Freq chisq.test(y, p=x/sum(x)) # 5. Gene selection. tTest.pValues <- apply(golub, 1, function(x) t.test(x ~ golubFactor, alternative = c("greater"))$p.value) golub.gnames[order(tTest.pValues)[1:10],2] # 6. Antigens. library(multtest); data(golub) golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) tTest.pValues <- apply(golub, 1, function(x) { t.test(x ~ golubFactor)$p.value} ) antigens <-grep("antigen", golub.gnames[,2], ignore.case = TRUE) golub.antigens<-golub[antigens,] tTest.pValues.antigens<-tTest.pValues[antigens] golub.gnames.antigens<-golub.gnames[antigens,] golub.gnames.antigens[order(tTest.pValues.antigens)[1:length(antigens)],2] # 7. Genetic Model. From the output below, the null hypothesis that the # probabilities are as specified is not rejected. chisq.test(x=c(930,330,290,90),p=c(9/16,3/16,3/16,1/16)) # 8. Comparing two genes. all66 <- golub[66,golubFactor=="ALL"] all790 <- golub[790,golubFactor=="ALL"] boxplot(all66,all790) mean(all66); mean(all790) median(all66); median(all790) sd(all66); sd(all790) IQR(all66)/1.349 ; IQR(all790)/1.349 # notice the scaling by 1.349! mean(all66); mean(all790) mad(all66); mad(all790) shapiro.test(all66); shapiro.test(all790) # 9. Normality tests. library(multtest); data(golub) golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) all.shapiro.pValues <- apply(golub[,golubFactor=="ALL"], 1, function(x) shapiro.test(x)$p.value) aml.shapiro.pValues <- apply(golub[,golubFactor=="AML"], 1, function(x) shapiro.test(x)$p.value) 100 * sum(all.shapiro.pValues>0.05)/length(all.shapiro.pValues) 100 * sum(aml.shapiro.pValues>0.05)/length(aml.shapiro.pValues) 100 * sum(all.shapiro.pValues>0.05 & aml.shapiro.pValues>0.05)/(length(all.shapiro.pValues)+length(aml.shapiro.pValues)) # 10. Two-sample tests on gene expression values of the Golub et al. (1999) data. # (a) data(golub, package = "multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) tTest.pValues <- apply(golub, 1, function(x) t.test(x ~ golubFactor)$p.value) o <- order(tTest.pValues,decreasing=FALSE) golub.gnames[o[1:10],2] # (b) wilcoxon.pValues <- apply(golub, 1, function(x) wilcox.test(x ~ golubFactor)$p.value) o <- order(wilcoxon.pValues,decreasing=FALSE) golub.gnames[o[1:10],2] # 11. Biological hypotheses. # (a) n = 1000 p = 0.05 np = n*p # (b) pbinom(9,1000,.05) # (c) sum(dbinom(6:1000,1000,.05)) # (d) sum(dbinom(2:8,1000,.05)) # 12. Programming some tests. # (a) data(golub,package="multtest") golubFactor <- factor(golub.cl,levels=0.1, labels= c("ALL","AML")) ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE) ccnd3 x <- golub[ccnd3,golubFactor=="ALL"] n <- length(x) y <- golub[ccnd3,golubFactor=="AML"] m <- length(y) t <- (mean(x)-mean(y))/sqrt(var(x)/n + var(y)/m) v <- (var(x)/n + var(y)/m)^2/( (var(x)/n)^2/(n-1) + (var(y)/m)^2/(m-1)) 2*pt(-abs(t),v) mean(x) - mean(y) + qt(0.025,v)* sqrt(var(x)/n + var(y)/m) mean(x) - mean(y) + qt(0.975,v)* sqrt(var(x)/n + var(y)/m) # (b) z <- golub[ccnd3,] sum(rank(z)[1:27]) - 0.5*27*(27+1) # (c) x <- golub[ccnd3,golubFactor=="ALL"] y <- golub[ccnd3,golubFactor=="AML"] w <- 0 for (i in 1:27) { w <- w + sum(x[i]>y) } w