############################################################################### # Answers to exercises in Chapter 2: Descriptive Statistics and Data Display ############################################################################### data(golub, package = "multtest") # 1. Illustration of mean and standard deviation. # (a) Use x<- c(1,1.5,2,2.5,3) and mean(x) and sd(x) to obtain # the mean is 2 and the standard deviation is 0.7905694. x <- c(1,1.5,2,2.5,3) mean(x) sd(x) # (b) Now the mean is 7.4 and dramatically increased the standard de- # viation 12.64615. # (c) The outlier increased the mean as well as the standard deviation. # 2. Comparing two genes. Take i <- 66 or i <- 790. # (a) Use boxplot(golub[i,]~gol.fac) to observe that 790 has three # outliers and 66 has no. golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) boxplot(golub[66,golubFactor=="ALL"]) boxplot(golub[790,golubFactor=="ALL"]) # (b) Use qqnorm(golub[i,golubFactor=="ALL"]) and qqline(golub[i,golubFactor=="ALL"]) # to observe that nearly all values of 66 are on the line, where as for # 790 the three outliers are way of the normality line. Hypothesis: # The expression values of 66 are normally distributed, but those of # row 790 are not. qqnorm(golub[66,golubFactor=="ALL"]) qqline(golub[66,golubFactor=="ALL"]) qqnorm(golub[790,golubFactor=="ALL"]) qqline(golub[790,golubFactor=="ALL"]) # (c) Use mean(golub[i,golubFactor=="ALL"]) and median(golub[i,golubFactor=="ALL"]). # The mean (-1.174024) is larger than the median (-1.28137) due to # outliers on the right hand side. For the gen in row 66 the mean is # 1.182503 and the median 1.23023. The differences are smaller. mean(golub[i,golubFactor=="ALL"]) median(golub[i,golubFactor=="ALL"]) # 3. Effect size. # (a) The size 11 is large, because the mean is eleven times larger than # the standard deviation. data(golub, package="multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) effectSize <- apply(golub[,golubFactor=="ALL"],1,function(x) mean(x)/sd(x)) o <- order(effectSize,decreasing=TRUE) effectSize[o[1:5]] golub.gnames[o[1:5],2] # (b) The robust variant can be defined by dividing the median by the # MAD. An alternative would be to divide the median by the IQR. # This gives other best genes indicating that the some genes may # have outliers that influence the outcome. robustEffectSize <- apply(golub[,golubFactor=="ALL"],1,function(x) median(x)/mad(x)) o <- order(robustEffectSize,decreasing=TRUE) robustEffectSize[o[1:5]] golub.gnames[o[1:5],2] # 4. Plotting gene expressions for CCND3 (Cyclin D3). The answers in the # script below. data(golub, package = "multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE) stripchart(golub[ccnd3,] ~ golubFactor,method="jitter") stripchart(golub[ccnd3,] ~ golubFactor,method="jitter",vertical = TRUE) stripchart(golub[ccnd3,] ~ golubFactor,method="jitter",col=c("red", "blue"), vertical = TRUE) stripchart(golub[ccnd3,] ~ golubFactor,method="jitter",col=c("red", "blue"), pch="*",vertical = TRUE) title("CCND3 (Cyclin D3) expression value for ALL and AML patients") # 5. Box-and-Whiskers plot for CCND3 (Cyclin D3).. x11() # create plotting window on Windows and linux # quartz() # create plotting window on Mac OSX data(golub, package = "multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) ccnd3 = grep("CCND3",golub.gnames[ ,2], ignore.case = TRUE) boxstats = boxplot.stats(golub[ccnd3, golubFactor=="ALL"], coef = 1.5, do.conf = TRUE, do.out = TRUE) #finds values boxstats boxplot(golub[ccnd3, golubFactor=="ALL"], xlim=c(0,4)) # Use boxstats so that this code is reusable for any boxplot! arrows(2.0, boxstats$stats[1], 1.24, boxstats$stats[1]); text(2.5,boxstats$stats[1],"lower whisker") arrows(2.0, boxstats$stats[2], 1.24, boxstats$stats[2]); text(2.5,boxstats$stats[2],"first quartile") arrows(2.0, boxstats$stats[3], 1.24, boxstats$stats[3]); text(2.5,boxstats$stats[3],"Median") arrows(2.0, boxstats$stats[4], 1.24, boxstats$stats[4]); text(2.5,boxstats$stats[4],"third quartile") arrows(2.0, boxstats$stats[5], 1.24, boxstats$stats[5]); text(2.5,boxstats$stats[5],"upper whisker") arrows(2.0, boxstats$out[1], 1.24, boxstats$out[1]) ; text(2.5,boxstats$out[1],"Outlier") arrows(2.0, boxstats$out[2], 1.24, boxstats$out[2]) ; text(2.5,boxstats$out[2],"Outlier") arrows(2.0, boxstats$out[3], 1.24, boxstats$out[3]) ; text(2.5,boxstats$out[3],"Outlier") dev.copy2eps(device=x11,file="~/biology664/BoxplotWithExplanation.eps") # create plotting window on Windows and linux # dev.copy2eps(device=quartz,file="~/biology664/BoxplotWithExplanation.eps") # create plotting window on Mac OSX # 6. Box-and-whiskers plot of persons of Golub et al. (1999) data.. # (a) The medians are all around zero, the inter quartile range differ # only slightly, the minimal values are all around minus 1.5. All # persons have outliers near three. boxplot(data.frame(golub)) # (b) The means are very close to zero. The medians are all between # (-0:15383, 0:06922), so these are also close to zero. patientMean <- apply(golub,2,mean) patientMean patientMedian <- apply(golub,2,median) patientMedian # (c) The data seem preprocessed to have standard deviation equal to # one. The re-scaled IQR and MAD have slightly larger range. range(apply(golub,2,sd)) range(apply(golub,2,function(x) IQR(x)/1.349)) range(apply(golub,2,mad)) # 7. Oncogenes of Golub et al. (1999) data. # (a) Note that we need the transpose operator t to change rows into # columns. The script below will do. data(golub, package="multtest") golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML")) rowIndex <- grep("oncogene",golub.gnames[,2], ignore.case = TRUE) golubOncogenes <- golub[rowIndex,] golubOncogenes.gnames <- golub.gnames[rowIndex,] row.names(golubOncogenes) <- golubOncogenes.gnames[,3] boxplot(data.frame(t(golubOncogenes[,golubFactor=="ALL"]))) # (b) The plot gives a nice overview of the distributions of the gene # expressions values of the oncogene separately for the ALL and # the AML patients. Several genes behave similarly for ALL and # AML. Some are clearly distributed around zero, but others not. # Also, some have a small inter quartile ranges, while for others # this is large. A similar statement holds for outliers, some do not # have outliers, but others certainly have. Some gene show distinct # distributions between patient groups. For instance, the sixth has # ALL expressions around zero, but those for AML are larger than # zero. par(mfrow=c(2,1)) boxplot(data.frame(t(golubOncogenes[,golubFactor=="ALL"]))) title("box-and-whiskers plot for oncogenes of ALL patients ") boxplot(data.frame(t(golubOncogenes[,golubFactor=="AML"]))) title("box-and-whiskers plot for oncogenes of AML patients ") par(mfrow=c(1,1)) # 8. Descriptive statistics for the ALL gene expression values of the Golub # et al. (1999) data. # (a) The ranges indicate strong difference in means. The range of the # mean and of the median are similar. The bulk of the data seems # symmetric. range(apply(golub[,golubFactor=="ALL"],1,mean)) range(apply(golub[,golubFactor=="ALL"],1,median)) # (b) The range of the standard deviation is somewhat smaller than of # the re-scaled IQR and MAD. range(apply(golub[,golubFactor=="ALL"],1,sd)) range(apply(golub[,golubFactor=="ALL"],1,function(x) IQR(x)/1.349)) range(apply(golub[,golubFactor=="ALL"],1,mad))