###############################################################################
###############################################################################
# Chapter 10
###############################################################################
###############################################################################

########################################
# 10.1 Random sampling
########################################

# Example 1. Throwing a coin. A fair coin X attains Head and Tail with...
noquote(sample(c("H","T"),30,rep=TRUE,prob=c(0.5,0.5)))

# Example 2. Generating a sequence of nucleotides. Another example is...
noquote(sample(c("A","G","C","T"),30,rep=TRUE,prob=c(0.1,0.4,0.4,0.1)))

########################################
# 10.2 Probability transition matrix
########################################

# Example 1. Using the probability transition matrix to generate a Markov...
markov <- function(StateSpace,P,n, pi0=NULL, initState=NULL){
    seq <- character(n)

    if (is.null(pi0)) {
        seq[1] <- initState
    }
    else {
        seq[1] <- sample(StateSpace, 1, replace=TRUE, pi0)
    }

    for(k in 1:(n-1)) {
        seq[k+1] <- sample(StateSpace, 1, replace=TRUE, P[seq[k],])
    }

    return(seq)
}

P <- matrix(c(1/6,5/6,0.5,0.5), 2, 2, byrow=TRUE)
rownames(P) <- colnames(P) <- stateSpace <- c("Y","R")
P
noquote(markov(stateSpace,P,30,initState="Y"))

# Example 2. A sequence with a large frequency of C and G. To illustrate...
P <- matrix(c(
  1/6,5/6,0,0,
  1/8,2/4,1/4,1/8,
  0,2/6,3/6,1/6,
  0,1/6,3/6,2/6),4,4,byrow=TRUE)
rownames(P) <- colnames(P) <- StateSpace <- c("A","G","C","T")
P

pi0 <- c(1/4,1/4,1/4,1/4)
names(pi0) = StateSpace
pi0

seq <- markov(StateSpace,P,1000, pi0=pi0)
table(seq)

# Example 3. A sequence with high phenylalanine frequency. Now it is...
P <- matrix(c(.01,.01,.01,.97,
              .01,.01,.01,.97,
              .01,.01,.01,.97,
              .01,.28,.01,0.70),4,4,byrow=T)
rownames(P) <- colnames(P) <- StateSpace <- c("A","G","C","T")
P

pi0 <- c(1/4,1/4,1/4,1/4)
names(pi0) = StateSpace
pi0

seq <- markov(StateSpace,P,30000, pi0=pi0)
table(getTrans(seq))

# Example 4. To illustrate estimation of the probability transition matrix...
nr <- count(tolower(seq),2)
nr
A <- matrix(NA,4,4)
A[1,1] <- nr["aa"]; A[1,2]<-nr["ag"]; A[1,3]<-nr["ac"]; A[1,4]<-nr["at"]
A[2,1] <- nr["ga"]; A[2,2]<-nr["gg"]; A[2,3]<-nr["gc"]; A[2,4]<-nr["gt"]
A[3,1] <- nr["ca"]; A[3,2]<-nr["cg"]; A[3,3]<-nr["cc"]; A[3,4]<-nr["ct"]
A[4,1] <- nr["ta"]; A[4,2]<-nr["tg"]; A[4,3]<-nr["tc"]; A[4,4]<-nr["tt"]
rowsumA <- apply(A, 1, sum)
Phat <- sweep(A, 1, rowsumA, FUN="/")
rownames(Phat) <- colnames(Phat) <- c("a","g","c","t")
round(Phat,3)

########################################
# 10.3 Properties of the transition matrix
########################################

# Example 1. Matrix multiplication to compute probabilities. Suppose...
P <- matrix(c(5/6,1/6,0.5,0.5),2,2,byrow=TRUE)
pi0 <- c(2/3,1/3)
pi0 %*% P

# Example 3. Given the probability matrix of the previous example, the...
P %*% P

########################################
# 10.4 Stationary distribution
########################################
install.packages(c("DTMCPack"),repo="http://cran.r-project.org",dep=TRUE)
library(DTMCPack) # Suite of functions related to discrete-time discrete-state Markov Chains

# Example 1. Stationary distribution. To compute the eigen-decomposition...
P <- matrix(c(1/6,5/6,0.5,0.5),2,2,byrow=TRUE)
V <- eigen(P,symmetric = FALSE)
V$values
V$vectors
Pto5 = V$vec %*% diag(V$va)^(5) %*% solve(V$vec) # after 5 iterations: row1=row2, each row is the same unique stationary distribution
Pto5 # Both rows give the same stationary distribution, therefore the markov model is ergodic

Pto16 = V$vec %*% diag(V$va)^(16) %*% solve(V$vec) # after 16 iterations: row1=row2, each row is the same unique stationary distribution
Pto16 # Both rows give the same stationary distribution, therefore the markov model is ergodic

# calculate the stationary distribution
statdistr(P)

# Example 2. Diploid. Suppose A is a dominant gene, a a recessive...
P <- matrix(c(1,0,0, 1/4,1/2,1/4,0,0,1),3,3,byrow=TRUE) # Not an ergodic transition matrix!
rownames(P) <- colnames(P) <- c("AA", "aA", "aa")
P
V <- eigen(P,symmetric = FALSE)
V$va
V$vec
V$vec %*% diag(V$va)^(5) %*% solve(V$vec)
V$vec %*% diag(V$va)^(10) %*% solve(V$vec)
V$vec %*% diag(V$va)^(15) %*% solve(V$vec)

Pto5 = V$vec %*% diag(V$va)^(5) %*% solve(V$vec)
Pto5 # Here, each row gives a different stationary distribution!! The markov model is not ergodic


Pto5 %*% P

Pto60 = V$vec %*% diag(V$va)^(60) %*% solve(V$vec)
Pto60 # Here, each row gives a different stationary distribution!! The markov model is not ergodic
Pto60 %*% P

pi0 <- c(0,1,0) # The initial distribution is all aA
pi0 %*% Pto60 # gives us the 2nd row of Pto60 which is the stationary distribution for the pi0 initial distribution

# By looking at Pto60 we can see that the 3 rows give alteast 3 different stationary dists for P
statdistr(P) # call fails because the transition matrix is singular.

########################################
# 10.5 Phylogenetic distance
########################################

# Example 1. From a rate matrix to a probability transition matrix...
library(Matrix)
Q <- 0.2 * Matrix(c(-3,1,1,1,1,-3,1,1,1,1,-3,1,1,1,1,-3),4)
rownames(Q) <- colnames(Q) <- c("A","G","C","T")
P <- as.matrix(expm(Q))
round(Q,2)
round(P,2)

# Example 2. The K81 model. To compute the rate matrix of the K81...
alpha <- 3/6; beta <- 2/6; gamma<- 1/6; Q <- matrix(data=NA,4,4)
Q[1,2] <- Q[2,1] <- Q[3,4] <- Q[4,3] <- alpha
Q[1,3] <- Q[3,1] <- Q[2,4] <- Q[4,2] <- beta
Q[1,4] <- Q[4,1] <- Q[2,3] <- Q[3,2] <- gamma
diag(Q) <- -(alpha + beta + gamma)
Q
Q <- Matrix(Q)
P <- as.matrix(expm(Q))
P

# Example 3. Stationarity for the JC69 model. Let's take ff = 1=5...
library(Matrix)
alpha <- 1/5; Q <- matrix(rep(alpha,16),4,4)
diag(Q) <- -3 * alpha
Q <- Matrix(Q)
P <- as.matrix(expm(Q))
V <- eigen(P,symmetric = FALSE)
V$vec %*% diag(V$va)^(50) %*% solve(V$vec) # we can raise the transition matrix to the 50th power
statdistr(P) # or explicitely solve for the stationary distribution

# Example 4. Distance between two sequences according to the JC69...
library(ape); library(seqinr)
accnr <- paste("AJ5345",26:27,sep="")
seqbin <- read.GenBank(accnr, species.names = TRUE, as.character = FALSE)
dist.dna(seqbin, model = "JC69")
seq <- read.GenBank(accnr, species.names = TRUE, as.character = TRUE)
p <- sum(seq$AJ534526!=seq$AJ534527)/1143
d <- -log(1-4*p/3)*3/4
d

# Example 5. Phylogenetic tree of a series of downloaded sequences. To...
library(ape); library(seqinr); library(phangorn)
accnr <- paste("AJ5345",26:35,sep="")
seq <- read.GenBank(accnr)
names(seq) <- attr(seq, "species")
dist <- dist.dna(seq, model = "K80")

# Use distance-based methods to initialize Parsimony and ML methods
treeUPGMA = upgma(dist) # UPGMA
treeNJ = NJ(dist) # neighbor joining

plot(treeUPGMA, main="UPGMA")
plot(treeNJ, "unrooted", main="NJ")

# Maximum Parsimony
seqPhyDat = phyDat(seq)
parsimony(treeUPGMA, seqPhyDat)
parsimony(treeNJ, seqPhyDat)

treePars = optim.parsimony(treeUPGMA, seqPhyDat)

# Maximum Likelihood
fit = pml(treeNJ, data=seqPhyDat) # get the initial model fit
fit
fitJC = optim.pml(fit, TRUE) # perform ML optimization using default Jukes-Cantor model as the starting tree
logLik(fitJC) # Log-Likelihood of the model

# use the function modelTest to compare different models via the AIC or BIC
mt = modelTest(seqPhyDat)
mt

# retrieve the HKY+G+I model
env <- attr(mt, "env")
ls(envir=env)
fit <- eval(get("HKY+G+I", env), env)

# Calculate Bootstrapping
bs = bootstrap.pml(fit, bs=100, optNni=TRUE, control = pml.control(trace = 0))
plotBS(fit$tree, bs) #Now we can plot the tree with the bootstrap support values on the edges
add.scale.bar(length=0.01)

########################################
# 10.6 Hidden Markov Models
########################################

# Example 1. Occasionally dishonest casino. A casino uses a fair die most...
hmm <- function(A,E,n,initState){
    observationSet <- colnames(E)
    hiddenSet <- rownames(E)
    x <- sample(observationSet,1,replace=T,E[initState,])
    h <- markov(hiddenSet,A,n,initState=initState)
    for(k in 1:(n-1)) {x[k+1] <- sample(observationSet,1,replace=T,E[h[k],])}
    out <- cbind(x,h)
    return(out)
}

E <- matrix(c(rep(1/6,6),rep(1/10,5),1/2),2,6,byrow=T) #emission matrix
A <- matrix(c(0.95,0.05,0.1,0.9),2,2,byrow=TRUE)       #transition matrix

colnames(E) <- EmissionSpace <- c("1","2","3","4","5","6")
rownames(E) <- StateSpace <- c("F","L")
E

rownames(A) <- colnames(A) <- StateSpace
A

statePath <- hmm(A,E,100,c("F"))
dimnames(statePath) <- list(rep("",100),c("observation","hidden state"))
noquote(t(statePath))

# Example 2. The viterbi algorithm can be programmed and applied...
# library("Rmpfr")
viterbi <- function(A,E,seq,pi0) {
    # v <- matrix(NA, nr=length(seq), nc=dim(A)[2])
    v <- mpfrArray(NA, dim=c(length(seq),dim(A)[2]), precBits = 100)
    colnames(v) = rownames(E)
    v[1,] <- pi0
    for(i in 2:length(seq)) {
        for (l in 1:dim(A)[1]) {v[i,l] <- E[l,seq[i]] * max(v[(i-1),] * A[,l])} }
    return(v)
}

vit <- viterbi(A,E,statePath[,1],c(1,0))
vitRowMax <- unlist(apply(vit, 1, function(x) names(x)[which.max(x)]))

statePath <- cbind(statePath,vitRowMax)
dimnames(statePath) <- list(rep("",100),c("observation","hidden state","predicted state"))
noquote(t(statePath))

1 - (sum(statePath[,2] == statePath[,3]) / nrow(statePath))

# Example 3. GC Isochores
E <- matrix(c(0.15,0.35,0.35,0.15,
              0.35,0.15,0.15,0.35),2,4,byrow=T) #emission matrix
A <- matrix(c(0.99,0.01,0.005,0.995),2,2,byrow=TRUE)       #transition matrix

colnames(E) <- EmissionSpace <- c("A","G","C","T")
rownames(E) <- StateSpace <- c("GC","AT")
E
rownames(A) <- colnames(A) <- StateSpace
A

statePath <- hmm(A,E,1000,c("AT"))
dimnames(statePath) <- list(rep("",1000),c("observation","hidden state"))

statePathFactor = factor(statePath[,2])
plot(x=1:length(statePathFactor),    # 1 to n
     y=as.numeric(statePathFactor),  # 1 or 2 based on factor
     type = "l",                     # line plot
     yaxt = "n",                     # do not plot y-axis
     xlab = "Chromosome Location",   # x-axis label
     ylab = "GC vs AT Isochores",    # y-axis label
     col = "magenta",                # line color
     lwd = 2,                        # line width = 2
     cex.lab=1.5                     # make axis labels big
     )
axis(2, at=c(1,2), labels=c("AT","GC"), cex.axis=1.5) # plot y-axis with special labels

quartz()

vit <- viterbi(A,E,statePath[,1],c(0,1))
vitRowMax <- unlist(apply(vit, 1, function(x) names(x)[which.max(x)]))
vitRowMaxFactor = factor(vitRowMax)
plot(x=1:length(vitRowMaxFactor),    # 1 to n
     y=as.numeric(vitRowMaxFactor),  # 1 or 2 based on factor
     type = "l",                     # line plot
     yaxt = "n",                     # do not plot y-axis
     xlab = "Chromosome Location",   # x-axis label
     ylab = "GC vs AT Isochores",    # y-axis label
     col = "magenta",                # line color
     lwd = 2,                        # line width
     cex.lab=1.5                     # make axis labels big
     )
axis(2, at=c(1,2), labels=c("AT","GC"), cex.axis=1.5) # plot y-axis with special labels

statePath <- cbind(statePath,vitRowMax)
dimnames(statePath) <- list(rep("",1000),c("observation","hidden state","predicted state"))
noquote(t(statePath))

1 - (sum(statePath[,2] == statePath[,3]) / nrow(statePath))


######################################################
# Using HMMs for CNV calling
######################################################

source("http://bioconductor.org/biocLite.R")
biocLite("HMMcopy")
vignette("HMMcopy")
library("HMMcopy")

# Loading HTS copy number data
rfile <- system.file("extdata", "normal.wig", package = "HMMcopy")
gfile <- system.file("extdata", "gc.wig", package = "HMMcopy")
mfile <- system.file("extdata", "map.wig", package = "HMMcopy")
normal_reads <- wigsToRangedData(rfile, gfile, mfile)
normal_reads[1000:1010, ]

# Correcting HTS copy number data
normal_copy <- correctReadcount(normal_reads)
normal_copy[1000:1010, ]

# Visualizing the effects of correction
par(cex.main = 0.7, cex.lab = 0.7, cex.axis = 0.7, mar = c(4, 4, 2, 0.5))
plotBias(normal_copy, pch = 20, cex = 0.5)

# Visualizing corrected copy number profiles
par(mar = c(4, 4, 2, 0))
plotCorrection(normal_copy, pch = ".")

#Correcting and visualizing tumour copy number profiles
tfile <- system.file("extdata", "tumour.wig", package = "HMMcopy")
tumour_copy <- correctReadcount(wigsToRangedData(tfile, gfile, mfile))
par(mar = c(4, 4, 2, 0))
plotCorrection(tumour_copy, pch = ".")

# Segmentation and Classification of Copy Number Profiles
tumour_segments <- HMMsegment(tumour_copy)

# Visualizing segments and classified states
par(mfrow = c(1, 1))
par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
plotSegments(tumour_copy, tumour_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")
cols <- stateCols() # 6 default state colours
legend("topleft", c("HOMD", "HETD", "NEUT", "GAIN", "AMPL", "HLAMP"), fill = cols, horiz = TRUE, bty = "n", cex = 0.5)

# Improving segmentation performance
default_param <- HMMsegment(tumour_copy, getparam = TRUE)
default_param

# Reducing the number of segments
longseg_param <- default_param
longseg_param$e <- 0.999999999999999
longseg_param$strength <- 1e30
longseg_segments <- HMMsegment(tumour_copy, longseg_param, verbose = FALSE)

par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
plotSegments(tumour_copy, longseg_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")
legend("topleft", c("HOMD", "HETD", "NEUT", "GAIN", "AMPL", "HLAMP"), fill = cols, horiz = TRUE, bty = "n", cex = 0.5)

# Adjusting copy number state ranges
longseg_segments$mus
longseg_param$mu

par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
plotSegments(tumour_copy, longseg_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")
for(i in 1:nrow(longseg_segments$mus)) {
  abline(h = longseg_segments$mus[i ,ncol(longseg_segments$mus)], col = cols[i], lwd = 2, lty = 3)
}
abline(v = 7.68e7, lwd = 2, lty = 3)
abline(v = 8.02e7, lwd = 2, lty = 3)


newmu_param <- longseg_param
newmu_param$mu <- c(-0.5, -0.4, -0.15, 0.1, 0.4, 0.7)
newmu_segments <- HMMsegment(tumour_copy, newmu_param, verbose = FALSE)
par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
plotSegments(tumour_copy, newmu_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")

# Understanding parameter convergence
newmu_segments$mus
longseg_param$mu

# Overriding parameter convergence
par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
newmu_param$m <- newmu_param$mu
realmu_segments <- HMMsegment(tumour_copy, newmu_param, verbose = FALSE)
plotSegments(tumour_copy, realmu_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")

# Normalizing tumour by normal copy number profiles
somatic_copy <- tumour_copy
# LOGARITHM IDENTITY: log(a) - log(b) == lob(a / b)
somatic_copy$copy <- tumour_copy$copy - normal_copy$copy

somatic_segments <- HMMsegment(somatic_copy, newmu_param, verbose = FALSE)
par(cex.main = 0.5, cex.lab = 0.5, cex.axis = 0.5, mar = c(2, 1.5, 0, 0), mgp = c(1, 0.5, 0))
plotSegments(somatic_copy, somatic_segments, pch = ".", ylab = "Tumour Copy Number", xlab = "Chromosome Position")