############################################################################### ############################################################################### # Chapter 10 ############################################################################### ############################################################################### ######################################## # 10.1 Random sampling ######################################## # Example 1. Throwing a coin. A fair coin X attains Head and Tail with... noquote(sample(c("H","T"),30,rep=TRUE,prob=c(0.5,0.5))) # Example 2. Generating a sequence of nucleotides. Another example is... noquote(sample(c("A","G","C","T"),30,rep=TRUE,prob=c(0.1,0.4,0.4,0.1))) ######################################## # 10.2 Probability transition matrix ######################################## # Example 1. Using the probability transition matrix to generate a Markov... markov <- function(StateSpace,P,n, pi0=NULL, initState=NULL){ seq <- character(n) if (is.null(pi0)) { seq[1] <- initState } else { seq[1] <- sample(StateSpace, 1, replace=TRUE, pi0) } for(k in 1:(n-1)) { seq[k+1] <- sample(StateSpace, 1, replace=TRUE, P[seq[k],]) } return(seq) } P <- matrix(c(1/6,5/6,0.5,0.5), 2, 2, byrow=TRUE) rownames(P) <- colnames(P) <- stateSpace <- c("Y","R") P noquote(markov(stateSpace,P,30,initState="Y")) # Example 2. A sequence with a large frequency of C and G. To illustrate... P <- matrix(c( 1/6,5/6,0,0, 1/8,2/4,1/4,1/8, 0,2/6,3/6,1/6, 0,1/6,3/6,2/6),4,4,byrow=TRUE) rownames(P) <- colnames(P) <- StateSpace <- c("A","G","C","T") P pi0 <- c(1/4,1/4,1/4,1/4) names(pi0) = StateSpace pi0 seq <- markov(StateSpace,P,1000, pi0=pi0) table(seq) # Example 3. A sequence with high phenylalanine frequency. Now it is... P <- matrix(c(.01,.01,.01,.97, .01,.01,.01,.97, .01,.01,.01,.97, .01,.28,.01,0.70),4,4,byrow=T) rownames(P) <- colnames(P) <- StateSpace <- c("A","G","C","T") P pi0 <- c(1/4,1/4,1/4,1/4) names(pi0) = StateSpace pi0 seq <- markov(StateSpace,P,30000, pi0=pi0) table(getTrans(seq)) # Example 4. To illustrate estimation of the probability transition matrix... nr <- count(tolower(seq),2) nr A <- matrix(NA,4,4) A[1,1] <- nr["aa"]; A[1,2]<-nr["ag"]; A[1,3]<-nr["ac"]; A[1,4]<-nr["at"] A[2,1] <- nr["ga"]; A[2,2]<-nr["gg"]; A[2,3]<-nr["gc"]; A[2,4]<-nr["gt"] A[3,1] <- nr["ca"]; A[3,2]<-nr["cg"]; A[3,3]<-nr["cc"]; A[3,4]<-nr["ct"] A[4,1] <- nr["ta"]; A[4,2]<-nr["tg"]; A[4,3]<-nr["tc"]; A[4,4]<-nr["tt"] rowsumA <- apply(A, 1, sum) Phat <- sweep(A, 1, rowsumA, FUN="/") rownames(Phat) <- colnames(Phat) <- c("a","g","c","t") round(Phat,3) ######################################## # 10.3 Properties of the transition matrix ######################################## # Example 1. Matrix multiplication to compute probabilities. Suppose... P <- matrix(c(5/6,1/6,0.5,0.5),2,2,byrow=TRUE) pi0 <- c(2/3,1/3) pi0 %*% P # Example 3. Given the probability matrix of the previous example, the... P %*% P ######################################## # 10.4 Stationary distribution ######################################## install.packages(c("DTMCPack"),repo="http://cran.r-project.org",dep=TRUE) library(DTMCPack) # Suite of functions related to discrete-time discrete-state Markov Chains # Example 1. Stationary distribution. To compute the eigen-decomposition... P <- matrix(c(1/6,5/6,0.5,0.5),2,2,byrow=TRUE) V <- eigen(P,symmetric = FALSE) V$values V$vectors Pto5 = V$vec %*% diag(V$va)^(5) %*% solve(V$vec) # after 5 iterations: row1=row2, each row is the same unique stationary distribution Pto5 # Both rows give the same stationary distribution, therefore the markov model is ergodic Pto16 = V$vec %*% diag(V$va)^(16) %*% solve(V$vec) # after 16 iterations: row1=row2, each row is the same unique stationary distribution Pto16 # Both rows give the same stationary distribution, therefore the markov model is ergodic # calculate the stationary distribution statdistr(P) # Example 2. Diploid. Suppose A is a dominant gene, a a recessive... P <- matrix(c(1,0,0, 1/4,1/2,1/4,0,0,1),3,3,byrow=TRUE) # Not an ergodic transition matrix! rownames(P) <- colnames(P) <- c("AA", "aA", "aa") P V <- eigen(P,symmetric = FALSE) V$va V$vec V$vec %*% diag(V$va)^(5) %*% solve(V$vec) V$vec %*% diag(V$va)^(10) %*% solve(V$vec) V$vec %*% diag(V$va)^(15) %*% solve(V$vec) Pto5 = V$vec %*% diag(V$va)^(5) %*% solve(V$vec) Pto5 # Here, each row gives a different stationary distribution!! The markov model is not ergodic Pto5 %*% P Pto60 = V$vec %*% diag(V$va)^(60) %*% solve(V$vec) Pto60 # Here, each row gives a different stationary distribution!! The markov model is not ergodic Pto60 %*% P pi0 <- c(0,1,0) # The initial distribution is all aA pi0 %*% Pto60 # gives us the 2nd row of Pto60 which is the stationary distribution for the pi0 initial distribution # By looking at Pto60 we can see that the 3 rows give alteast 3 different stationary dists for P statdistr(P) # call fails because the transition matrix is singular. ######################################## # 10.5 Phylogenetic distance ######################################## # Example 1. From a rate matrix to a probability transition matrix... library(Matrix) Q <- 0.2 * Matrix(c(-3,1,1,1,1,-3,1,1,1,1,-3,1,1,1,1,-3),4) rownames(Q) <- colnames(Q) <- c("A","G","C","T") P <- as.matrix(expm(Q)) round(Q,2) round(P,2) # Example 2. The K81 model. To compute the rate matrix of the K81... alpha <- 3/6; beta <- 2/6; gamma<- 1/6; Q <- matrix(data=NA,4,4) Q[1,2] <- Q[2,1] <- Q[3,4] <- Q[4,3] <- alpha Q[1,3] <- Q[3,1] <- Q[2,4] <- Q[4,2] <- beta Q[1,4] <- Q[4,1] <- Q[2,3] <- Q[3,2] <- gamma diag(Q) <- -(alpha + beta + gamma) Q Q <- Matrix(Q) P <- as.matrix(expm(Q)) P # Example 3. Stationarity for the JC69 model. Let's take ff = 1=5... library(Matrix) alpha <- 1/5; Q <- matrix(rep(alpha,16),4,4) diag(Q) <- -3 * alpha Q <- Matrix(Q) P <- as.matrix(expm(Q)) V <- eigen(P,symmetric = FALSE) V$vec %*% diag(V$va)^(50) %*% solve(V$vec) # we can raise the transition matrix to the 50th power statdistr(P) # or explicitely solve for the stationary distribution # Example 4. Distance between two sequences according to the JC69... library(ape); library(seqinr) accnr <- paste("AJ5345",26:27,sep="") seqbin <- read.GenBank(accnr, species.names = TRUE, as.character = FALSE) dist.dna(seqbin, model = "JC69") seq <- read.GenBank(accnr, species.names = TRUE, as.character = TRUE) p <- sum(seq$AJ534526!=seq$AJ534527)/1143 d <- -log(1-4*p/3)*3/4 d # Example 5. Phylogenetic tree of a series of downloaded sequences. To... library(ape); library(seqinr); library(phangorn) accnr <- paste("AJ5345",26:35,sep="") seq <- read.GenBank(accnr) names(seq) <- attr(seq, "species") dist <- dist.dna(seq, model = "K80") # Use distance-based methods to initialize Parsimony and ML methods treeUPGMA = upgma(dist) # UPGMA treeNJ = NJ(dist) # neighbor joining plot(treeUPGMA, main="UPGMA") plot(treeNJ, "unrooted", main="NJ") # Maximum Parsimony seqPhyDat = phyDat(seq) parsimony(treeUPGMA, seqPhyDat) parsimony(treeNJ, seqPhyDat) treePars = optim.parsimony(treeUPGMA, seqPhyDat) # Maximum Likelihood fit = pml(treeNJ, data=seqPhyDat) # get the initial model fit fit fitJC = optim.pml(fit, TRUE) # perform ML optimization using default Jukes-Cantor model as the starting tree logLik(fitJC) # Log-Likelihood of the model # use the function modelTest to compare different models via the AIC or BIC mt = modelTest(seqPhyDat) mt # retrieve the HKY+G+I model env <- attr(mt, "env") ls(envir=env) fit <- eval(get("HKY+G+I", env), env) # Calculate Bootstrapping bs = bootstrap.pml(fit, bs=100, optNni=TRUE, control = pml.control(trace = 0)) plotBS(fit$tree, bs) #Now we can plot the tree with the bootstrap support values on the edges add.scale.bar(length=0.01) ######################################## # 10.6 Hidden Markov Models ######################################## # Example 1. Occasionally dishonest casino. A casino uses a fair die most... hmm <- function(A,E,n,initState){ observationSet <- colnames(E) hiddenSet <- rownames(E) x <- sample(observationSet,1,replace=T,E[initState,]) h <- markov(hiddenSet,A,n,initState=initState) for(k in 1:(n-1)) {x[k+1] <- sample(observationSet,1,replace=T,E[h[k],])} out <- cbind(x,h) return(out) } E <- matrix(c(rep(1/6,6),rep(1/10,5),1/2),2,6,byrow=T) #emission matrix A <- matrix(c(0.95,0.05,0.1,0.9),2,2,byrow=TRUE) #transition matrix colnames(E) <- EmissionSpace <- c("1","2","3","4","5","6") rownames(E) <- StateSpace <- c("F","L") E rownames(A) <- colnames(A) <- StateSpace A statePath <- hmm(A,E,100,c("F")) dimnames(statePath) <- list(rep("",100),c("observation","hidden state")) noquote(t(statePath)) # Example 2. The viterbi algorithm can be programmed and applied... # library("Rmpfr") viterbi <- function(A,E,seq,pi0) { # v <- matrix(NA, nr=length(seq), nc=dim(A)[2]) v <- mpfrArray(NA, dim=c(length(seq),dim(A)[2]), precBits = 100) colnames(v) = rownames(E) v[1,] <- pi0 for(i in 2:length(seq)) { for (l in 1:dim(A)[1]) {v[i,l] <- E[l,seq[i]] * max(v[(i-1),] * A[,l])} } return(v) } vit <- viterbi(A,E,statePath[,1],c(1,0)) vitRowMax <- unlist(apply(vit, 1, function(x) names(x)[which.max(x)])) statePath <- cbind(statePath,vitRowMax) dimnames(statePath) <- list(rep("",100),c("observation","hidden state","predicted state")) noquote(t(statePath)) 1 - (sum(statePath[,2] == statePath[,3]) / nrow(statePath)) # Example 3. GC Isochores E <- matrix(c(0.15,0.35,0.35,0.15, 0.35,0.15,0.15,0.35),2,4,byrow=T) #emission matrix A <- matrix(c(0.99,0.01,0.005,0.995),2,2,byrow=TRUE) #transition matrix colnames(E) <- EmissionSpace <- c("A","G","C","T") rownames(E) <- StateSpace <- c("GC","AT") E rownames(A) <- colnames(A) <- StateSpace A statePath <- hmm(A,E,1000,c("AT")) dimnames(statePath) <- list(rep("",1000),c("observation","hidden state")) statePathFactor = factor(statePath[,2]) plot(x=1:length(statePathFactor), # 1 to n y=as.numeric(statePathFactor), # 1 or 2 based on factor type = "l", # line plot yaxt = "n", # do not plot y-axis xlab = "Chromosome Location", # x-axis label ylab = "GC vs AT Isochores", # y-axis label col = "magenta", # line color lwd = 2, # line width = 2 cex.lab=1.5 # make axis labels big ) axis(2, at=c(1,2), labels=c("AT","GC"), cex.axis=1.5) # plot y-axis with special labels quartz() vit <- viterbi(A,E,statePath[,1],c(0,1)) vitRowMax <- unlist(apply(vit, 1, function(x) names(x)[which.max(x)])) vitRowMaxFactor = factor(vitRowMax) plot(x=1:length(vitRowMaxFactor), # 1 to n y=as.numeric(vitRowMaxFactor), # 1 or 2 based on factor type = "l", # line plot yaxt = "n", # do not plot y-axis xlab = "Chromosome Location", # x-axis label ylab = "GC vs AT Isochores", # y-axis label col = "magenta", # line color lwd = 2, # line width cex.lab=1.5 # make axis labels big ) axis(2, at=c(1,2), labels=c("AT","GC"), cex.axis=1.5) # plot y-axis with special labels statePath <- cbind(statePath,vitRowMax) dimnames(statePath) <- list(rep("",1000),c("observation","hidden state","predicted state")) noquote(t(statePath)) 1 - (sum(statePath[,2] == statePath[,3]) / nrow(statePath)) ###################################################### # Using HMMs for CNV calling ###################################################### source("http://bioconductor.org/biocLite.R") biocLite("HMMcopy") vignette("HMMcopy") library("HMMcopy") # Loading HTS copy number data rfile <- system.file("extdata", "normal.wig", package = "HMMcopy") gfile <- system.file("extdata", "gc.wig", package = "HMMcopy") mfile <- system.file("extdata", "map.wig", package = "HMMcopy") normal_reads <- wigsToRangedData(rfile, gfile, mfile) normal_reads[1000:1010, ] # Correcting HTS copy number data normal_copy <- correctReadcount(normal_reads) normal_copy[1000:1010, ] # Visualizing the effects of correction par(cex.main = 0.7, cex.lab = 0.7, cex.axis = 0.7, mar = c(4, 4, 2, 0.5)) plotBias(normal_copy, pch = 20, cex = 0.5) # Visualizing corrected copy number profiles par(mar = c(4, 4, 2, 0)) plotCorrection(normal_copy, pch = ".") #Correcting and visualizing tumour copy number profiles tfile <- system.file("extdata", "tumour.wig", package = "HMMcopy") tumour_copy <- correctReadcount(wigsToRangedData(tfile, gfile, mfile)) par(mar = c(4, 4, 2, 0)) plotCorrection(tumour_copy, pch = ".") # Segmentation and Classification of Copy Number Profiles tumour_segments <- HMMsegment(tumour_copy) # Visualizing segments and classified states par(mfrow = c(1, 1)) par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) plotSegments(tumour_copy, tumour_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position") cols <- stateCols() # 6 default state colours legend("topleft", c("HOMD", "HETD", "NEUT", "GAIN", "AMPL", "HLAMP"), fill = cols, horiz = TRUE, bty = "n", cex = 0.5) # Improving segmentation performance default_param <- HMMsegment(tumour_copy, getparam = TRUE) default_param # Reducing the number of segments longseg_param <- default_param longseg_param$e <- 0.999999999999999 longseg_param$strength <- 1e30 longseg_segments <- HMMsegment(tumour_copy, longseg_param, verbose = FALSE) par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) plotSegments(tumour_copy, longseg_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position") legend("topleft", c("HOMD", "HETD", "NEUT", "GAIN", "AMPL", "HLAMP"), fill = cols, horiz = TRUE, bty = "n", cex = 0.5) # Adjusting copy number state ranges longseg_segments$mus longseg_param$mu par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) plotSegments(tumour_copy, longseg_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position") for(i in 1:nrow(longseg_segments$mus)) { abline(h = longseg_segments$mus[i ,ncol(longseg_segments$mus)], col = cols[i], lwd = 2, lty = 3) } abline(v = 7.68e7, lwd = 2, lty = 3) abline(v = 8.02e7, lwd = 2, lty = 3) newmu_param <- longseg_param newmu_param$mu <- c(-0.5, -0.4, -0.15, 0.1, 0.4, 0.7) newmu_segments <- HMMsegment(tumour_copy, newmu_param, verbose = FALSE) par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) plotSegments(tumour_copy, newmu_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position") # Understanding parameter convergence newmu_segments$mus longseg_param$mu # Overriding parameter convergence par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) newmu_param$m <- newmu_param$mu realmu_segments <- HMMsegment(tumour_copy, newmu_param, verbose = FALSE) plotSegments(tumour_copy, realmu_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position") # Normalizing tumour by normal copy number profiles somatic_copy <- tumour_copy # LOGARITHM IDENTITY: log(a) - log(b) == lob(a / b) somatic_copy$copy <- tumour_copy$copy - normal_copy$copy somatic_segments <- HMMsegment(somatic_copy, newmu_param, verbose = FALSE) par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0)) plotSegments(somatic_copy, somatic_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")