# We will start with a built-in data set called MLL.B from the...

# One-time installation of the Bioconductor packages used in this script.
# NOTE(review): biocLite() is the legacy Bioconductor installer; current
# Bioconductor releases document BiocManager::install() instead -- confirm
# which installer the target environment supports.
source("http://bioconductor.org/biocLite.R")
for (pkg in c("affy", "affyPLM", "ALLMLL", "genefilter", "limma", "annaffy",
              "hgu95av2.db", "GEOquery", "GO.db", "GO", "made4")) {
  biocLite(pkg)
}

library(affy)
library(ALLMLL)
data(MLL.B, package = "ALLMLL")

# Inspect the MLL.B object: class, help page, structure, slots, dimensions
# of the expression matrix, annotation, and the first probe / gene names.
MLL.B
class(MLL.B)
?MLL.B
MLL.B
str(MLL.B)
slotNames(MLL.B)
dim(exprs(MLL.B))
annotation(MLL.B)
probeNames(MLL.B)[1:10]
geneNames(MLL.B)[1:10]

# PM intensities of probe set 200000_s_at (first 4 probes, first 3 arrays).
pm(MLL.B, "200000_s_at")[1:4, 1:3]

# par(mfrow=c(2,2))
matplot(
  pm(MLL.B, "200000_s_at"),
  type = "l",
  xlab = "Probe No.",
  ylab = "PM Probe intensity"
)
# matplot(pm(MLL.B,"200000_s_at") - mm(MLL.B,"200000_s_at"),type="l", xlab="Probe No.", ylab="PM Probe intensity")
hist(MLL.B)

# MA plot for the 6th sample relative to the pseudo-median reference
MAplot(MLL.B, which = 6, cex.lab = 1.5)

# Smooth-scatter MA plots for the first 4 samples relative to the
# pseudo-median reference
par(mfrow = c(2, 2))  # split the window into 4 panels, filled sequentially
MAplot(MLL.B, which = c(1, 2, 3, 4), plot.method = "smoothScatter")

# Smooth-scatter MA plots for the first 4 samples relative to sample 5
MAplot(MLL.B, which = c(1, 2, 3, 4), ref = 5, plot.method = "smoothScatter")
par(mfrow = c(1, 1))  # revert the canvas to one plot per window

image(MLL.B)  # hit the escape key to exit out

# Visualizing RNA degradation
degrade <- AffyRNAdeg(MLL.B)
plotAffyRNAdeg(degrade, col = 1:20)

########################################
# 6.2 Preprocessing methods
########################################

# List the available methods for each preprocessing step.
bgcorrect.methods()
pmcorrect.methods()
normalize.methods(MLL.B)
express.summary.stat.methods()

# Example 1. The three pre-processing steps can be employed one after...
eset <- expresso(
  MLL.B,
  bgcorrect.method = "rma",
  normalize.method = "constant",
  pmcorrect.method = "pmonly",
  summary.method = "avgdiff"
)

# Example 2. Another frequently applied preprocessing method is RMA....
library(affy)
library(affyPLM)
data(MLL.B, package = "ALLMLL")
eset2 <- rma(MLL.B)

# Density curve of the RMA expression values for every array, computed on a
# shared set of 30 break points so that the curves can be overlaid.
mybreaks <- seq(from = 0, to = max(exprs(eset2)), length.out = 30)
hists <- apply(exprs(eset2), 2, function(x) {
  h <- hist(x, breaks = mybreaks, plot = FALSE)
  h$density
})
# One column per array; the column count is taken from the data instead of
# being hard-coded to 20.
hists.matrix <- matrix(unlist(hists), ncol = ncol(exprs(eset2)), byrow = FALSE)
matplot(hists.matrix, type = "l")

library(limma)
limma::plotMA(
  eset2,
  array = 6,     # plot array 6
  main = NULL,   # no title
  cex.lab = 1.5  # make axis labels big
)
MAplot(eset2, which = 6, cex.lab = 1.5)

# Example 3. In the sequel we shall frequently work with the ALL data...
data(ALL, package = "ALL")
table(ALL$BT)
?ALL::ALL
slotNames(ALL)
row.names(exprs(ALL))[1:10]

# Keep the ALL1/AF4 patients; ALL1pp is a copy whose expression values are
# replaced below by a median-centred, MAD-scaled version (per array).
# The phenotype column is spelled out in full (mol.biol) rather than relying
# on partial matching of `$mol`.
ALL1pp <- ALL1 <- ALL[, ALL$mol.biol == "ALL1/AF4"]
mads <- apply(exprs(ALL1), 2, mad)
meds <- apply(exprs(ALL1), 2, median)
dat <- sweep(exprs(ALL1), 2, meds)
exprs(ALL1pp) <- sweep(dat, 2, mads, FUN = "/")
class(ALL1)
boxplot(exprs(ALL1))
boxplot(exprs(ALL1pp))

# The R package "made4" contains a command that will create a quick overview
# and automatically generate multiple plots in a single command
library(made4)
overview(MLL.B)

########################################
# 6.3 Gene filtering
########################################

# Example 1. Filtering by the coefficient of variation. A manner to filter...
cvval <- apply(exprs(ALL1pp), 1, function(x) {
  sd(x) / abs(mean(x))
})
sum(cvval < 0.2)
ALL1pp[cvval < 0.2, ]

# Example 2. Combining several filters. It is often desired to combine...
library("genefilter")
f1 <- function(x) (IQR(x) > 0.5)
f2 <- pOverA(.25, log2(100))
f3 <- function(x) (median(2^x) > 300)
f4 <- function(x) (shapiro.test(x)$p.value > 0.05)
f5 <- function(x) (sd(x) / abs(mean(x)) < 0.1)
# One-sample t statistic against the two-sided 5% critical value. The sample
# size comes from the data: the previous hard-coded sqrt(10) / 9 degrees of
# freedom were only right for exactly 10 samples.
f6 <- function(x) {
  n <- length(x)
  sqrt(n) * abs(mean(x)) / sd(x) > qt(0.975, n - 1)
}
ff <- filterfun(f1, f2, f3, f4, f5, f6)
library("ALL")
data(ALL)
selected <- genefilter(exprs(ALL[, ALL$BT == "B"]), ff)
sum(selected)

# Example 3. Filtering by t-test and normality. One may also want to...
library("genefilter")
library("ALL")
data(ALL)

# TRUE for patients whose BT value is one of B, B1, ..., B4.
patientB <- factor(ALL$BT %in% c("B", "B1", "B2", "B3", "B4"))

# f1: Shapiro-Wilk p-value above 0.05; f2: two-group t-test p-value below 0.05.
f1 <- function(x) {
  shapiro.test(x)$p.value > 0.05
}
f2 <- function(x) {
  t.test(x ~ patientB)$p.value < 0.05
}

# Apply f1 within each patient group separately, keep genes passing in both,
# then apply the t-test filter to the remaining genes.
sel1 <- genefilter(exprs(ALL[, patientB == TRUE]), filterfun(f1))
sel2 <- genefilter(exprs(ALL[, patientB == FALSE]), filterfun(f1))
preSelected <- sel1 & sel2
preSelectedALLs <- ALL[preSelected, ]
sel3 <- genefilter(exprs(preSelectedALLs), filterfun(f2))
selectedALLs <- preSelectedALLs[sel3, ]
dim(selectedALLs)

# Venn diagram of the three filters; sel3 is recomputed on all genes so the
# three indicator vectors have the same length.
library(limma)
sel3 <- genefilter(exprs(ALL), filterfun(f2))
x <- matrix(as.integer(c(sel1, sel2, sel3)), ncol = 3, byrow = FALSE)
colnames(x) <- c("sel1", "sel2", "sel3")
vc <- vennCounts(x, include = "both")
vennDiagram(vc, circle.col = c("blue", "red", "green"), lwd = 3)

# NOTE(review): the heatmap is drawn from every row of ALL, not from the
# filtered selectedALLs -- confirm that this is intended.
library("gplots")
heatmap.2(
  exprs(ALL),
  scale = "row",
  col = greenred(75),
  dendrogram = "both",
  key = TRUE,
  symkey = FALSE,
  density.info = "none",
  trace = "none",
  cexRow = 0.5
)

########################################
# 6.4 Applications of linear models
########################################

# Example 1. Analysis of variance. We select patients with B-cell leukemia in...
library("ALL")
library("limma")
data(ALL, package = "ALL")
allB <- ALL[, which(ALL$BT %in% c("B", "B1", "B2"))]

# Design matrix without intercept: one column per B-cell stage.
design.ma <- model.matrix(~ 0 + factor(allB$BT))
colnames(design.ma) <- c("B", "B1", "B2")
fit <- lmFit(allB, design.ma)
fit <- eBayes(fit)
toptab <- topTable(fit, coef = NULL, number = 5, adjust.method = "fdr")
print(toptab[, 1:6], digits = 4)

# Contrasts between consecutive stages.
cont.ma <- makeContrasts(B - B1, B1 - B2, levels = factor(allB$BT))
cont.ma
fit1 <- contrasts.fit(fit, cont.ma)
fit1 <- eBayes(fit1)
toptabcon <- topTable(fit1, coef = NULL, number = 5, adjust.method = "fdr")
print(toptabcon[, 1:5], digits = 4)

# Example 2. Summarizing output in HTML format. It is often desired to...
library("annaffy")
library("hgu95av2.db")
anntable <- aafTableAnn(
  as.character(row.names(toptabcon)),
  "hgu95av2.db",
  aaf.handler()
)
saveHTML(anntable, "~/biology664/ALLB123.html", title = "B-cell 012 ALL")

# Example 3.
# Using basic R functions. It is also possible to summarize...
library("multtest")
library("annaffy")
library("hgu95av2.db")
library("ALL")
data(ALL, package = "ALL")
ALLB <- ALL[, which(ALL$BT %in% c("B", "B1", "B2"))]

# Per-gene one-way ANOVA p-value for the BT factor. The p-value column is
# addressed by its full name "Pr(>F)" instead of the partial match `$Pr`.
panova <- apply(exprs(ALLB), 1, function(x) {
  anova(lm(x ~ ALLB$BT))[["Pr(>F)"]][1]
})
genenames <- featureNames(ALLB)[panova < 0.000001]
atab <- aafTableAnn(genenames, "hgu95av2.db",
                    aaf.handler()[c(1:3, 8:9, 11:13)])
saveHTML(atab, file = "~/biology664/ANOVAonB-cellGroups.html")
getwd()

# Example 4. Analyzing public available data. The GDS1365 data con-...
library(GEOquery)
library(limma)
library(hgu95av2.db)
library(annaffy)
gds <- getGEO("GDS1365")
eset <- GDS2eSet(gds, do.log2 = TRUE)
prot <- pData(eset)$protocol
time <- pData(eset)$time

# Two-way ANOVA per gene: p-values for protocol, time and their interaction.
# All rows of the expression matrix are used (previously a hard-coded
# 1:12625 slice), so the result always lines up with featureNames(eset).
pval <- apply(exprs(eset), 1, function(x) {
  anova(lm(x ~ prot * time))[["Pr(>F)"]][1:3]
})
pvalt <- data.frame(t(pval))
colnames(pvalt) <- c("meffprot", "mefftime", "interaction")
genenames <- featureNames(eset)[pvalt$meffprot < 0.01 &
                                  pvalt$mefftime < 0.01 &
                                  pvalt$interaction < 0.01]
atab <- aafTableAnn(genenames, "hgu95av2.db",
                    aaf.handler()[c(1:3, 8:9, 11:13)])
saveHTML(atab, file = "~/biology664/Two-way ANOVA protocol by time.html")

########################################
# 6.5 Searching an annotation package
########################################
library("ALL")
data(ALL)
annotation(ALL)
library(hgu95av2.db)
ls("package:hgu95av2.db")
ChrNrOfProbe <- as.list(hgu95av2CHR)
ChrNrOfProbe[1]
?hgu95av2CHR

# Look up probe 1389_at in several annotation maps. `envir` is written out
# in full rather than relying on partial argument matching of `env`.
get("1389_at", envir = hgu95av2ACCNUM)
get("1389_at", envir = hgu95av2ENTREZID)
get("1389_at", envir = hgu95av2SYMBOL)
get("1389_at", envir = hgu95av2GENENAME)
get("1389_at", envir = hgu95av2UNIGENE)
library(annotate)
genbank("J03779", disp = "browser")
genbank(179833, disp = "data", type = "uid")
get("1389_at", envir = hgu95av2CHRLOC)
get("1389_at", envir = hgu95av2MAP)

########################################
# 6.6 Using annotation to search literature
########################################
library(hgu95av2.db)
library(annotate)
library(ALL)
data(ALL)
pmid <- get("1389_at", envir = hgu95av2PMID)
pubmed(pmid, disp = "browser")
absts <- pm.getabst("1389_at", "hgu95av2")
pm.titles(absts)
ne <- pm.abstGrep("neutral endopeptidase", absts[[1]])
pmAbst2HTML(absts[[1]], filename = "~/biology664/pmon1389_at.html")

########################################
# 6.7 Searching GO numbers and evidence
########################################
go1389 <- get("1389_at", envir = hgu95av2GO)
idl <- lapply(go1389, function(x) x$GOID)
idl[[1]]
library(annotate)
getOntology(go1389, "BP")
getEvidence(go1389)

# Keep only the GO entries whose evidence code is "TAS".
go1389TAS <- subset(go1389, getEvidence(go1389) == "TAS")
sapply(go1389TAS, function(x) x$GOID)
sapply(go1389TAS, function(x) x$Evidence)
sapply(go1389TAS, function(x) x$Ontology)

########################################
# 6.8 GO parents and children
########################################

# Example 1. Collecting GO information. There are functions to obtain...
GOMFPARENTS$"GO:0003700"
GOMFCHILDREN$"GO:0003700"
go1389 <- get("1389_at", envir = hgu95av2GO)
gonr <- getOntology(go1389, "BP")
gP <- getGOParents(gonr)
gC <- getGOChildren(gonr)
gPC <- c(gonr, gP, gC)
pa <- sapply(gP, function(x) x$Parents)
ch <- sapply(gC, function(x) x$Children)
gonrc <- c(gonr, unlist(pa), unlist(ch))
length(gonrc)

# Example 2. Probe selection by GO. A research strategy may be to start...
library(GO.db)
library(annotate)
library("ALL")
data(ALL)
go1389 <- get("1389_at", envir = hgu95av2GO)
gonr <- getOntology(go1389, "BP")
gP <- getGOParents(gonr)
pa <- sapply(gP, function(x) x$Parents)
probes <- mget(unlist(pa), hgu95av2GO2ALLPROBES)
probeNames <- unlist(probes)
ALLpr <- ALL[probeNames, ]
dim(exprs(ALLpr))

########################################
# 6.9 Gene filtering by a biological term
########################################

# Example 1. Filter gene by a term. From a biological point of view...
# GOTERM is provided by GO.db (already used elsewhere in this script); the
# stand-alone "GO" package loaded here previously is no longer distributed.
library("GO.db")
library("annotate")
library("hgu95av2.db")

#' Find the GO identifiers whose term text matches a pattern.
#'
#' @param term Pattern handed to `grep()` and matched against the `Term`
#'   slot of every entry in `GOTERM`.
#' @return The names of the `GOTERM` entries with at least one match.
GOTerm2Tag <- function(term) {
  GTL <- eapply(GOTERM, function(x) {
    grep(term, x@Term, value = TRUE)
  })
  # vapply keeps the match counts an integer vector even for empty input.
  Gl <- vapply(GTL, length, integer(1))
  names(GTL[Gl > 0])
}

GOTerm2Tag("transcriptional repressor")
GOTerm2Tag("repressor")

# Collect the probes annotated to three repressor-related GO identifiers and
# keep those that are present on the ALL expression matrix.
tran1 <- hgu95av2GO2ALLPROBES$"GO:0003714"
tran2 <- hgu95av2GO2ALLPROBES$"GO:0008231"
tran3 <- hgu95av2GO2ALLPROBES$"GO:0017053"
tran <- c(tran1, tran2, tran3)
inboth <- tran %in% row.names(exprs(ALL))
ALLtran <- ALL[tran[inboth], ]
GOTERM$"GO:0017053"
dim(exprs(ALLtran))

########################################
# 6.10 Significance per chromosome
########################################

# Example 1. On the expression values of the ALL data we perform a two...
library("ALL")
data(ALL)
library("hgu95av2.db")

# Per-probe t-test p-value comparing the remission groups.
rawp <- apply(exprs(ALL), 1, function(x) t.test(x ~ ALL$remission)$p.value)
xx <- as.list(hgu95av2CHR)
# NOTE(review): probes without a chromosome annotation may produce NA names
# in this selection -- confirm against the annotation package in use.
AffimIDChr19 <- names(xx[xx == "19"])
names(rawp) <- featureNames(ALL)

# 2 x 2 table: rows = chromosome 19 / all other probes,
# columns = p < 0.05 / p >= 0.05. Using >= (previously >) makes the two
# columns an exact partition, so a p-value of exactly 0.05 is still counted.
f <- matrix(NA, 2, 2)
f[1, 1] <- sum(rawp[AffimIDChr19] < 0.05)
f[1, 2] <- sum(rawp[AffimIDChr19] >= 0.05)
f[2, 1] <- sum(rawp < 0.05) - f[1, 1]
f[2, 2] <- sum(rawp >= 0.05) - f[1, 2]
print(f)
fisher.test(f)
chisq.test(f)