############################################################################### # Answers to exercises in Chapter 9: Analyzing Sequences ############################################################################### # 1. Writing to a FASTA file. choosebank("genbank"); library(seqinr) ccnd3.human = query("ccnd3.human","sp=homo sapiens AND k=ccnd3@") ccnd3.human.DNAseqs <- sapply(ccnd3.human$req, getSequence) x1 <- DNAStringSet(c2s(ccnd3.human.DNAseqs[[1]])) write.XStringSet(x1, file="ccnd3.fa", format="fasta", width=80) ccnd3.human.DNAseqs.c2s <- sapply(ccnd3.human.DNAseqs, c2s) x1 <- DNAStringSet(ccnd3.human.DNAseqs.c2s) write.XStringSet(x1, file="ccnd3n.fa", format="fasta", width=80) # An alternative would be to use the write.dna function of the ape package. # 2. Dotplot of sequences. seq1 <- sample(c("A","G","C","T"),100,rep=TRUE,prob=c(0.1,0.4,0.4,0.1)) seq2 <- sample(c("A","G","C","T"),100,rep=TRUE,prob=c(0.1,0.4,0.4,0.1)) par(mfrow=c(1,2)) dotPlot(seq1, seq2, main = "Dot plot of 2 different random sequences\nwsize = 1, wstep = 1", wsize = 1, wstep = 1) dotPlot(seq1, seq1, main = "Dot plot of a random sequence with itself\nwsize = 1, wstep = 1", wsize = 1, wstep = 1) par(mfrow=c(1,1)) par(mfrow=c(1,2)) dotPlot(seq1, seq2, main = "Dot plot of 2 different random sequences\nwsize = 3, wstep = 1", wsize = 3, wstep = 1) dotPlot(seq1, seq1, main = "Dot plot of a random sequence with itself\nwsize = 3, wstep = 1", wsize = 3, wstep = 1) par(mfrow=c(1,1)) par(mfrow=c(1,2)) dotPlot(seq1, seq2, main = "Dot plot of 2 different random sequences\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) dotPlot(seq1, seq1, main = "Dot plot of a random sequence with itself\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) par(mfrow=c(1,1)) par(mfrow=c(1,2)) dotPlot(seq1, seq1, main = "Dot plot of a random sequence with itself\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) dotPlot(seq1, seq1[100:1], main = "Dot plot of a random sequence with it's self-reverse\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) par(mfrow=c(1,1)) x <- c("RPLWVAPDGHIFLEAFSPVYK") y <- c("RPLWVAPDGHIFLEAFSPVYK") z <- c("PLWISPSDGRIILESFSPLAE") choosebank("genbank"); library(seqinr) ccnd3.human = query("ccnd3.human","sp=homo sapiens AND k=ccnd3@") ccnd3.human.DNAseqs <- sapply(ccnd3.human$req, getSequence) sapply(ccnd3.human$req, getName) ccnd3.human.AAseqs <- sapply(ccnd3.human$req, getTrans) dotPlot(ccnd3.human.AAseqs[[1]], s2c("EEEVFPLAMN"), main = "Dot plot of two proteins\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) dotPlot(ccnd3.human.AAseqs[[7]], ccnd3.human.AAseqs[[8]], main = "Dot plot of two proteins\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) dotPlot(s2c(x), s2c(z), main = "Dot plot of two proteins\nwsize = 3, wstep = 3", wsize = 3, wstep = 3) # 3. Local alignment. library(seqinr); library(Biostrings); data(BLOSUM50) smith.waterman = function(x, y, sub, d) { # create the dynamic programming matrices F and G F <- matrix(data=NA,nrow=(length(y)+1),ncol=(length(x)+1)) G <- matrix(data=NA,nrow=(length(y)+1),ncol=(length(x)+1)) rownames(F) <- c("",y); colnames(F) <- c("",x) rownames(G) <- c("",y); colnames(G) <- c("",x) F[,1] <- rep(0,length(y)+1); F[1,] <- rep(0,length(x)+1) G[,1] <- rep(" ",length(y)+1); G[1,] <- rep(" ",length(x)+1) for (i in 2:(nrow(F))) for (j in 2:(ncol(F))) { diagonal = F[i-1,j-1]+sub[i-1,j-1] up = F[i-1,j]-d left = F[i,j-1]-d max = max(0, diagonal, up, left) F[i,j] <- max if (max == 0) {G[i,j] = " "} else if (diagonal == max) {G[i,j] = "d"} else if (up == max) {G[i,j] = "u"} else {G[i,j] = "l"} } # Now perform traceback and alignment maxF = which(F == max(F), arr.ind = TRUE); i=maxF[1]; j=maxF[2]; maxL = max(length(x)+length(y)); ii=maxL; A1=vector(,maxL); A2=vector(,maxL); repeat{ if (G[i,j] == "d") {G[i,j] = "D"; A1[ii] = x[j-1]; A2[ii] = y[i-1]; i=i-1; j=j-1} else if (G[i,j] == "u") {G[i,j] = "U"; A1[ii] = "-"; A2[ii] = y[i-1]; i=i-1} else {G[i,j] = "L"; A1[ii] = x[j-1]; A2[ii] = "-"; j=j-1} ii=ii-1 if (F[i,j] == 0){ break } } return(list(ScoreMatrix=F, TracebackMatrix=G, Alignment=rbind(A1,A2)[,(ii+1):maxL], Score=F[maxF])) } x <- s2c("HEAGAWGHEE"); y <- s2c("PAWHEAE"); s <- BLOSUM50[y,x]; d <- 8 dpMatrix = smith.waterman(x,y,s,d) dpMatrix # 4. Probability of more extreme alignment score. library(seqinr); library(Biostrings); data(BLOSUM50) randallscore <- c(1,1) for (i in 1:1000) { x <- c2s(sample(rownames(BLOSUM50),7, replace=TRUE)) y <- c2s(sample(rownames(BLOSUM50),10, replace=TRUE)) randallscore[i] <- pairwiseAlignment(AAString(x), AAString(y), substitutionMatrix, gapOpening = 0, gapExtension = -8, scoreOnly = TRUE) } sum(randallscore>1)/1000 plot(density(randallscore)) # 5. Prochlorococcus marinus. library(seqinr) choosebank("genbank") ccmp = query("ccmp","AC=AE017126 OR AC=BX548174 OR AC=BX548175") ccmp.DNAseqs <- sapply(ccmp$req,getSequence) ccmp.DNAseqs.gc <- sapply(ccmp.DNAseqs, GC) wilcox.test(ccmp.DNAseqs.gc[1:2],ccmp.DNAseqs.gc[3:9]) t.test(ccmp.DNAseqs.gc[1:2],ccmp.DNAseqs.gc[3:9]) # 6. Sequence equality. library(seqinr) choosebank("genbank") ccnd3.human = query("ccnd3.human","sp=homo sapiens AND k=ccnd3@") sapply(ccnd3.human$req,getLength) ccnd3.human.AAseqs <- sapply(ccnd3.human$req, getTrans) table(ccnd3.human.AAseqs[[1]]) table(ccnd3.human.AAseqs[[2]]) which(!ccnd3.human.AAseqs[[1]]==ccnd3.human.AAseqs[[2]]) # 7. Conserved region. # ID XRODRMPGMNTB, BLOCK # AC PR00851A, distance from previous block=(52,131) # DE Xeroderma pigmentosum group B protein signature # BL adapted, width=21, seqs=8, 99.5%=985, strength=1287 # XPB_HUMAN|P19447 ( 74) RPLWVAPDGHIFLEAFSPVYK 54 # XPB_MOUSE|P49135 ( 74) RPLWVAPDGHIFLEAFSPVYK 54 # P91579 ( 80) RPLYLAPDGHIFLESFSPVYK 67 # XPB_DROME|Q02870 ( 84) RPLWVAPNGHVFLESFSPVYK 79 # RA25_YEAST|Q00578 ( 131) PLWISPSDGRIILESFSPLAE 100 # Q38861 ( 52) RPLWACADGRIFLETFSPLYK 71 # O13768 ( 90) PLWINPIDGRIILEAFSPLAE 100 # O00835 ( 79) RPIWVCPDGHIFLETFSAIYK 86 library(Biostrings); data(BLOSUM50) x <- c("RPLWVAPDGHIFLEAFSPVYK") y <- c("RPLWVAPDGHIFLEAFSPVYK") z <- c("PLWISPSDGRIILESFSPLAE") x == y pairwiseAlignment(AAString(x), AAString(y), substitutionMatrix = "BLOSUM50", gapOpening = 0, gapExtension = -8, scoreOnly = FALSE) pairwiseAlignment(AAString(x), AAString(z), substitutionMatrix = "BLOSUM50", gapOpening = 0, gapExtension = -8, scoreOnly = FALSE) # 8. Plot of CG proportion from C. elegans. # (a) Produce a plot of the CG proportion of the chromosome I of C. el- # egans (Celegans.UCSC.ce2) along a window of 100 nucleotides. # Take the first 10,000 nucleotides. library(seqinr) source("http://bioconductor.org/biocLite.R") biocLite("BSgenome.Celegans.UCSC.ce2") library(BSgenome.Celegans.UCSC.ce2) GCperc <- double() for (i in 1:10000) { GCperc[i] <- GC(s2c(as.character(Celegans$chrI[i:(i+100)]))) } plot(GCperc,type="l") # (b) A binding sequence of the enzyme EcoRV is the subsequence # GATATC. How many exact matches has Chromosome I of C. elegans. subseq <- "gatatc" countPattern(subseq, Celegans$chrI, max.mismatch = 0) length(s2c(as.character(Celegans$chrI))) * (1/4)^6 # 9. Plot of codon usage. data(ec999) ec999.uco <- lapply(ec999, uco, index="eff") df <- as.data.frame(lapply(ec999.uco, as.vector)) row.names(df) <- names(ec999.uco[[1]]) dotchart.uco(rowSums(df), main = "Codon usage in 999 E. coli coding sequences") choosebank("genbank"); library(seqinr) ccnd.human = query("ccnd.human","sp=homo sapiens AND k=ccnd@") ccnd.human.DNAseqs <- sapply(ccnd.human$req, getSequence) ccnd.human.DNAseqs.uco <- lapply(ccnd.human.DNAseqs, uco, index="eff") df <- as.data.frame(lapply(ccnd.human.DNAseqs.uco, as.vector)) row.names(df) <- names(ccnd.human.DNAseqs.uco[[1]]) dotchart.uco(rowSums(df), main = "Codon usage in ccnd homo sapiens coding sequences")