############################################################################### ############################################################################### # Chapter 2 ############################################################################### ############################################################################### library(ALL) data(ALL) library(multtest) data(golub) ######################################## # 2.1 Descriptive statistics ######################################## ######################################## # 2.1.1 Measures of central tendency ######################################## # Example 1. To compute the mean and median of the ALL expression values... ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE) ccnd3 golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) mean(golub[ccnd3, golubFactor=="ALL"]) mean(golub[ccnd3, golubFactor=="ALL"], trim=.10) #drop the top and bottom 10% of datapoints median(golub[ccnd3, golubFactor=="ALL"]) ######################################## # 2.1.2 Measures of spread ######################################## # Example 1. These measures of spread for the ALL expression values... sd(golub[ccnd3, golubFactor=="ALL"]) IQR(golub[ccnd3, golubFactor=="ALL"]) / 1.349 mad(golub[ccnd3, golubFactor=="ALL"]) ######################################## # 2.1.2 Measures of correlation ######################################## # Order rows by decreasing difference between the mean expression between ALL and AML patients golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) meanALL <- apply(golub[,golubFactor=="ALL"], 1, mean) meanAML <- apply(golub[,golubFactor=="AML"], 1, mean) o <- order(abs(meanALL-meanAML), decreasing=TRUE) biomarkers50 = golub[o[1:50],] # Create a set of biomarkers out of the top 50 dim(biomarkers50) cor(biomarkers50[,33], biomarkers50[,37]) # compare 2 AML patients, method = Pearson's cor(biomarkers50[,1], biomarkers50[,33]) # campare an ALL to an AML patient, method = Pearson's cor(biomarkers50[,1], biomarkers50[,37]) # campare an ALL to another AML patient, method = Pearson's cor(biomarkers50[,33], biomarkers50[,37], method="spearman") cor(biomarkers50[,1], biomarkers50[,33], method="spearman") cor(biomarkers50[,1], biomarkers50[,37], method="spearman") ######################################## # 2.2 Data display ######################################## ######################################## # 2.2.1 Frequency table ######################################## # Example 1. A gene consists of a sequence of nucleotides A, C, G, T.... install.packages(c("ape"),repo="http://cran.r-project.org",dep=TRUE) library(ape) zyxin = read.GenBank(c("X94991.1"),as.character=TRUE) zyxinTable = table(zyxin) zyxinTable percentLabels = round(100*zyxinTable/sum(zyxinTable), 1) # 1 decimal place pieLabels = paste(percentLabels, "%", c(" A"," C"," G"," T"), sep="") pie(zyxinTable, # frequency table cex=1.5, # make labels big col=rainbow(4), # 4 very distant colors from the color spectrum labels=pieLabels) # nucleotide labels ######################################## # 2.2.2 Plotting data ######################################## # Example 1. Many visualization methods will be illustrated by the Golub data... data(golub, package = "multtest") plot(golub[ccnd3,], # values pch=19, # plot solid circles cex.lab=1.5, # make axis labels big xlab="Patient number", ylab="CCND3 (Cyclin D3) expression", col="blue") stripchart(golub[ccnd3,] ~ golubFactor, # values method="jitter", # add random horizontal jitter cex.lab=1.5, # make axis labels big vertical = TRUE, # boxplots vertical col=c("red", "darkgreen"), xlab="Leukemia subtype", ylab=NULL) ######################################## # 2.2.3 Histogram ######################################## # Example 1. A histogram of the expression values of gene "CCND3"... hist(golub[ccnd3, golubFactor=="ALL"], # values cex.lab=1.5, # make axis labels big main=NULL, # no title xlab="CCND3 (Cyclin D3) Expression", col="cyan") ######################################## # 2.2.4 Boxplot ######################################## # Example 1. A view on the distribution of the expression values... boxplot(golub[ccnd3,] ~ golubFactor, # values cex.lab=1.5, # make axis labels big main=NULL, # no title xlab="Leukemia subtype", ylab="CCND3 (Cyclin D3) Expression", col=c("purple","green")) pvec <- seq(0,1,0.25) quantile(golub[ccnd3, golubFactor=="ALL"],pvec) min(golub[ccnd3, golubFactor=="ALL"]) max(golub[ccnd3, golubFactor=="ALL"]) range(golub[ccnd3, golubFactor=="ALL"]) ######################################## # 2.2.5 Heatmaps ######################################## # install.package("gplots") # install.package("RColorBrewer") library("gplots") library("RColorBrewer") # Row scaling and no dendrogram (clustering) heatmap.2(biomarkers50, # values Rowv = NA, # no row clustering Colv = NA, # no column clustering scale = "row", # row-wise scaling col=greenred(75), # green to red heatmap dendrogram="none", # no dendrogram key=TRUE, # add color key symkey=FALSE, density.info="none", trace="none", cexRow=0.5) # make row labels small # Row scaling and row and column dendrograms (clustering) heatmap.2(biomarkers50, # values scale = "row", # row-wise scaling col=colorRampPalette(c("blue", "yellow"))(20), # blue to yellow heatmap dendrogram="both", # add row and column dendrogram key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5) # make row labels small # Repeat the clustering using median medianALL <- apply(golub[,golubFactor=="ALL"], 1, median) medianAML <- apply(golub[,golubFactor=="AML"], 1, median) o <- order(abs(medianALL - medianAML), decreasing=TRUE) robustBiomarkers50 = golub[o[1:50],] # Create a set of biomarkers out of the top 50 # Row scaling and row and column dendrograms (clustering) heatmap.2(robustBiomarkers50, # values scale = "row", # row-wise scaling col=bluered(75), # blue to red heatmap dendrogram="both", # add row and column dendrogram key=TRUE, symkey=FALSE, density.info="none", trace="none", cexRow=0.5) # make row labels small ######################################## # 2.2.6 Quantile-Quantile plot ######################################## # Example 1. To produce a Q-Q plot of the ALL gene expression values... qqnorm(golub[ccnd3, golubFactor=="ALL"], #values pch=19, # plot solid circles cex.lab=1.5, # make axis labels big col="red", main=NULL); # no title qqline(golub[ccnd3, golubFactor=="ALL"]) ######################################## # 2.2.7 Combined plots ######################################## hist(golub[ccnd3, golubFactor=="ALL"], # values breaks=9, # number of breaks (bins-1) prob=TRUE, # scale probability density function xlim=c(0,4), # x axis range ylim=c(0,2), # y axis range cex.lab=1.5, # make axis labels big main=NULL, # no title xlab="CCND3 (Cyclin D3) Expression", col="cyan") m<-mean(golub[ccnd3, golubFactor=="ALL"]) std<-sqrt(var(golub[ccnd3, golubFactor=="ALL"])) curve(dnorm(x, mean=m, sd=std), # fit a normal curve col="darkblue", lwd=2, # width 2 add=TRUE, # add to existing plot yaxt="n") # don't add to existing y axis # source("http://www.bioconductor.org/biocLite.R") # biocLite("edgeR") library(edgeR) maPlot(meanALL, meanAML, normalize=FALSE, lowess=TRUE, # plot Loess curve pch=19, # plot solid circles cex=0.1, # make solid circles very small cex.lab=1.5, # make axis labels big logAbundance=.5*(meanAML+meanALL), # X-values logFC=meanAML/meanALL, # Y-values xlab="1/2 (mean(AML) + mean(ALL))", ylab="mean(AML) / mean(ALL)") abline(h=1, lwd=2, lty=2, col="blue") # add horizontal line abline(v=0, lwd=3, lty=3, col="green") # add vertical line maPlot(meanAML, meanALL, normalize=FALSE, lowess=TRUE, # plot Loess curve pch=19, # plot solid circles cex=0.1, # make solid circles very small cex.lab=1.5, # make axis labels big logAbundance=.5*(meanAML+meanALL), # X-values logFC=meanALL/meanAML, # Y-values xlab="1/2 (mean(AML) + mean(ALL))", ylab="mean(ALL) / mean(AML)") abline(h=1, lwd=2, lty=2, col="blue") # add horizontal line abline(v=0, lwd=3, lty=3, col="green") # add vertical line x = meanAML-meanALL y = apply(golub, 1, function(x) {cor(x, golub.cl)} ) plot(x, # X-values y, # Y-values pch=19, # plot solid circles cex=0.1, # make solid circles very small cex.lab=1.5, # make axis labels big xlab="mean(AML) - mean(ALL)", ylab="Pearson's Correlation with Diagnosis") lines(lowess(x,y), col="orange", lwd=4) # add lowess line (x,y) abline(h=0, lwd=2, lty=2, col="blue") # add horizontal line abline(v=0, lwd=2, lty=2, col="red") # add vertical line x = abs(meanAML-meanALL) y = apply(golub, 1, function(x) {abs(cor(x, golub.cl))} ) plot(x, # X-values y, # Y-values pch=19, # plot solid circles cex=0.1, # make solid circles very small cex.lab=1.5, # make axis labels big xlab="abs(mean(AML) - mean(ALL))", ylab="Absolute Value of Pearson's Correlation with Diagnosis") lines(lowess(x,y), col="orange", lwd=4) # add lowess line (x,y) abline(h=0, lwd=2, lty=2, col="blue") # add horizontal line abline(v=0, lwd=2, lty=2, col="red") # add vertical line