############################################################################### ############################################################################### # Chapter 4 ############################################################################### ############################################################################### ######################################## # 4.1 Statistical hypothesis testing ######################################## ######################################## # 4.1.1 The Z-test ######################################## # Example 1. To illustrate the Z-test we shall concentrate on the Gdf5 ... data(golub, package = "multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) sigma <- 0.25; n <- 27; mu0 <- 0 x <- golub[2058,golubFactor=="ALL"] z.value <- sqrt(n)*(mean(x) - mu0)/sigma 2*pnorm(-abs(z.value),0,1) f<-function(x){dnorm(x,0,1)} x1 <- seq(-4,-1.96,0.01) y1 <- dnorm(x1,0,1) x2 <- seq(-1.96,1.96,0.01) y2 <- dnorm(x2,0,1) x3 <- seq(1.96,4,0.01) y3 <- dnorm(x3,0,1) plot(f, # function -4, # begin x-value 4, # end x-value cex.lab=1.5, # make axis labels big xlab="x", ylab="Normal probability density function f(x)") polygon(c(-4,x1,-1.96), c(0,y1,0), col="red") polygon(c(-1.96,x2,1.96), c(0,y2,0), col="lightblue") polygon(c(1.96,x3,4), c(0,y3,0), col="red") arrows(-3,0.15,-3,0.03) text(-3,0.23,"Rejection") text(-3,0.20,"Region") text(-3,0.17,expression(alpha/2)) arrows(3,0.15,3,0.03) text(3,0.23,"Rejection") text(3,0.20,"Region") text(3,0.17,expression(alpha/2)) text(0,0.23,"Acceptance") text(0,0.20,"Region") # Example 2. Using the data from Example 1, the 95% confidence interval... mean(x)+qnorm(c(0.025),0,1)*sigma/sqrt(n) mean(x)+qnorm(c(0.975),0,1)*sigma/sqrt(n) library(TeachingDemos) z.test(x,mu=0,sd=0.25) # Example 3. To develop intuition with respect to confidence intervals... ci.examp(mean.sim =0, sd = 1, n = 25, reps = 100, method = "z", lower.conf=0.025, upper.conf=0.975) ######################################## # 4.1.2 One Sample t-Test ######################################## f<-function(x){dt(x,5)} x1 <- seq(-4,qt(0.025,5),0.01) y1 <- f(x1) x2 <- seq(qt(0.025,5),qt(0.975,5),0.01) y2 <- f(x2) x3 <- seq(qt(0.975,5),4,0.01) y3 <- f(x3) plot(f, # function -4, # begin x-value 4, # end x-value xlab="x", ylab="t-Distribution probability density function f(x)") polygon(c(-4,x1,qt(0.025,5)), c(0,y1,0), col="red") polygon(c(qt(0.025,5),x2,qt(0.975,5)), c(0,y2,0), col="lightblue") polygon(c(qt(0.975,5),x3,4), c(0,y3,0), col="red") arrows(-3,0.15,-3,0.03) text(-3,0.23,"Rejection") text(-3,0.20,"Region") text(-3,0.17,expression(alpha/2)) arrows(3,0.15,3,0.03) text(3,0.23,"Rejection") text(3,0.20,"Region") text(3,0.17,expression(alpha/2)) text(0,0.23,"Acceptance") text(0,0.20,"Region") mtext(expression(t[0.025]),side=1,at=qt(0.025,5), col="red") mtext(expression(t[0.975]),side=1,at=qt(0.975,5), col="red") # Example 1. Let's test H0 : ¹ = 0 against H1 : ¹ 6= 0 for the ALL... x <- golub[2058,golubFactor=="ALL"]; mu0 <- 0; n <- 27 t.value <- sqrt(n)*(mean(x) - mu0)/sd(x) t.value 2 * pt(-0.0010, n-1) c(qt(0.025, n-1), qt(0.975, n-1)) mean(x)+qt(0.025,26)*sd(x)/sqrt(n) t.test(x,mu=0) # Example 2. In Chapter 2 a box-and-whiskers plot revealed that the... ccnd3 <- grep("CCND3",golub.gnames[,2], ignore.case = TRUE) ccnd3 t.test(golub[ccnd3,golubFactor=="ALL"],mu=0, alternative = c("greater")) ######################################## # 4.1.3 Two-sample t-test with unequal variances ######################################## # Example 1. Golub et al. (1999) argue that gene CCND3 Cyclin D3 plays... t.test(golub[ccnd3,] ~ golubFactor, var.equal=FALSE) ######################################## # 4.1.4 two-sample t-test with equal variances ######################################## # Example 1. The null hypothesis for gene CCND3 Cyclin D3 that the... t.test(golub[ccnd3,] ~ golubFactor, var.equal = TRUE) ######################################## # 4.1.5 Two-sample paired t-test ######################################## before = c(112.9, 133.5, 122.8, 145.6, 167.2, 169.2, 174.6, 175.3, 184.4, 121.3) after = c(120.7, 123.6, 134.0, 141.2, 156.8, 170.0, 182.0, 185.9, 196.0, 115.1) t.test(before, after, paired=TRUE) ######################################## # 4.1.6 F-test on equal variances ######################################## # Example 1. The null hypothesis for gene CCND3 Cyclin D3 that the... var.test(golub[ccnd3,] ~ golubFactor) ######################################## # 4.1.7 Binomial test ######################################## # Example 1. A microRNA of length 22 contains 18 purines. The null... 1 - pbinom(17, 22, 0.7) binom.test(18, 22, p = 0.7, alternative = c("greater"), conf.level = 0.95) ######################################## # 4.1.8 Chi-squared test ######################################## # Example 1. Suppose we want to test the hypothesis that the nucleotides... library(ape) zyxinfreq <- table(read.GenBank(c("X94991.1"),as.character=TRUE)) chisq.test(zyxinfreq) f<-function(x){dchisq(x,3)} plot(f, # function 0, # begin x-value 25, # end x-value cex.lab=1.5, # make axis labels big xlab="q", ylab="Chi-squared probability density function f(q)") x1<- seq(0,qchisq(0.95,3),0.01) x2<- seq(qchisq(0.95,3),20,0.01) polygon(c(0,x1,qchisq(0.95,3)), c(0,f(x1),0), col="lightblue") polygon(c(qchisq(0.95,3),x2,20), c(0,f(x2),0), col="red") arrows(10,0.045,10,0.02) text(10,0.06,"Rejection") text(10,0.05,"Region") text(3,0.04,"Acceptance") text(3,0.03,"Region") mtext("7.8",side=1,at=7.86) # Example 2. In the year 1866 Mendel observed in large number of exper-... pi <- c(0.75,0.25) x <-c(5474, 1850) chisq.test(x, p=pi) qchisq(0.95, 3) # Example 3. Given certain expression values for a healthy control group... data <- matrix(c(5,5,5,5),2,byrow=TRUE) chisq.test(data) data <- matrix(c(8,2,2,8),2,byrow=TRUE) chisq.test(data) ######################################## # 4.1.9 Fisher’s exact test ######################################## # Example 1. Example 1: Oncogenes on chromosome 1.... data <- matrix(c(300,500,3000,7000),2,byrow=TRUE) fisher.test(data) ######################################## # 4.1.10 Normality tests ######################################## # Example 1. To test the hypothesis that the ALL gene expression values... shapiro.test(golub[ccnd3, golubFactor=="ALL"]) library(nortest) ad.test(golub[ccnd3,golubFactor=="ALL"]) ######################################## # 4.1.11 Outliers test ######################################## # Example 1. From Figure 2.4 we have observed that expression values... library(outliers) grubbs.test(golub[ccnd3, golubFactor=="ALL"]) ######################################## # 4.1.12 Non-Parametric Tests ######################################## # One-sample sign test # Example 1: Small sample size. # install.packages(BSDA) library(BSDA) x<-c(6003, 6304, 6478, 6245, 6134, 6204, 6150, 6345, 6298, 6194) SIGN.test(x, md = 6000) # One-sample Wilcoxon signed-rank test # Example 1: Small sample size. x <- c(6003, 6304, 6478, 6245, 6134, 6204, 6150) wilcox.test(x,mu=6000) # Two-sample Wilcoxon rank-sum test # Example 1. CCND3 gene expression. The null hypothesis that the expression values for gene... wilcox.test(golub[ccnd3,] ~ golubFactor) # Two-sample paired sign test # Example 1: pathway gene expression. before = c(112.9, 133.5, 122.8, 145.6, 167.2, 169.2, 174.6, 175.3, 184.4, 121.3) after = c(120.7, 123.6, 134.0, 141.2, 156.8, 170.0, 182.0, 185.9, 196.0, 115.1) SIGN.test(before, after, alternative = "two.sided") # Two-sample paired Wilcoxon rank-sum test # Example 1: pathway gene expression. before = c(112.9, 133.5, 122.8, 145.6, 167.2, 169.2, 174.6, 175.3, 184.4, 121.3) after = c(120.7, 123.6, 134.0, 141.2, 156.8, 170.0, 182.0, 185.9, 196.0, 115.1) wilcox.test(before, after, paired=TRUE) # Non-Parametric Bootstrapping # Example 1: Small sample size. Lets use the bootstrap... library(boot) x <- c(6003, 6304, 6478, 6245, 6134) boot.mean <- boot(x, function(x,i){mean(x[i])}, R=9999) boot.mean mean(boot.mean$t) sd(boot.mean$t) boot.ci(boot.mean, conf = c(0.95), type = c("perc")) pvec <- c(0.025,0.975) quantile(boot.mean$t,pvec) ######################################## # 4.1.13 Robust Estimation ######################################## # Example 1: Normal distribution with an outlier. Lets construct... x <- rnorm(20,10,3) mean(x) x[21] <- 1000 mean(x) library(MASS) unlist(huber(x)) library(boot) boot.mean <- boot(x, function(x,i){mean(x[i])}, R=9999) jack.after.boot(boot.mean) ######################################## # 4.1.14 Type I and type II errors ######################################## ######################################## # 4.1.15 Power of a statistical test ######################################## # Example 1. With respect to the normal distribution it is... qnorm(0.95,0,1) pnorm(1.644854,3.5,1) # Example 2: Teaching demonstration. To further visualize... library(TeachingDemos) run.power.examp() # Example 3. If in case of a one sample t-Test an estimate... power.t.test(n=4, delta=300, sd=175, type = c("one.sample")) ######################################## # 4.2 Maximum likelihood Estimation ######################################## # Example 5. Various parameters can be estimated... x <- rpois(100,4) fitdistr(x,"Poisson") x <- c(6003, 6304, 6478, 6245, 6134) fitdistr(x,"Normal") x <- rexp(40000,5) fitdistr(x,"exponential") # Example 6. Frequently it is assumed that the data are... n <- 100 data <- matrix(0,nrow=n,ncol = 1) for (i in 1:n) { p <- sample(c(0,1), 1, replace = TRUE, prob=c(0.4,0.6)) data[i] <- p * rnorm(1,3000,300) + (1-p) * rnorm(1,6000,300) } dmixnor <- function(x,p,m1,s1,m2,s2){p * dnorm(x,m1,s1) + (1-p) * dnorm(x,m2,s2)} fitdistr(data,dmixnor,list(p=0.6,m1=3000,s1=300,m2=6000,s2=300)) ######################################## # 4.3 Application of tests to a whole gene set ######################################## # Example 1. Having a data matrix with gene expression values, a ques-... data(golub,package="multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) sh <- apply(golub[,golubFactor=="ALL"], 1, function(x) shapiro.test(x)$p.value) sum(sh > 0.05)/nrow(golub) * 100 # Example 2. In case the gene expression data are non-normally dis-... data(golub, package = "multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) pt <- apply(golub, 1, function(x) t.test(x ~ golubFactor)$p.value) pw <- apply(golub, 1, function(x) wilcox.test(x ~ golubFactor)$p.value) result <- data.frame(cbind(pw,pt)) result[pw<0.05 & abs(pt-pw)>0.2,]