###############################################################################
###############################################################################
# Chapter 7
###############################################################################
###############################################################################

########################################
# 7.1 Distance
########################################

# Example 1. Let a = 1 and b = 3. Then, obviously, the distance d(1, 3) = 2....

# Example 2. Suppose that a = (a1, a2) = (1, 1) and b = (b1, b2) = (4, 5)....

# Example 3. To compute the Euclidian distance between two vectors one...
a <- c(1,1); b <- c(4,5)
sqrt(sum((a-b)^2))

# Example 4. Distances between Cyclin gene expressions. By the build-in-...
library(multtest); data(golub)
cyclins <- grep("Cyclin",golub.gnames[,2])
golub.gnames[cyclins,2]
dist.cyclin <- dist(golub[cyclins,],method="euclidian")
distanceMatrix <- as.matrix(dist.cyclin)
rownames(distanceMatrix) <- colnames(distanceMatrix) <- golub.gnames[cyclins,3]
distanceMatrix[1:5,1:5]

# Example 5. Finding the ten closest genes to a given one. After selecting...
library("genefilter"); library("ALL"); data(ALL)
closeto1389_at <- genefinder(ALL, "1389_at", 10, method = "euc")
closeto1389_at[[1]]$indices
round(closeto1389_at[[1]]$dists,1)
featureNames(ALL)[closeto1389_at[[1]]$indices]

########################################
# 7.2 Two types of Cluster Analysis
########################################
library(pvclust)

########################################
# 7.2.1 Single Linkage
########################################

# Example 1. An explanatory example. To illustrate single linkage cluster...
names <- list(c("g1","g2","g3","g4","g5"),c("Coordinate p1 (patient 1 gene expression)","Coordinate p2 (patient 2 gene expression)"))
sl.clus.dat <- matrix(c(1,1,1,1.3,3,2,3,2.4,5,5),ncol = 2, byrow = TRUE,dimnames = names)
plot(sl.clus.dat,
     pch=19,
     col="blue",
     cex=1.4,
     xlim=c(0,6),
     ylim=c(0,6))
text(sl.clus.dat,labels=row.names(sl.clus.dat), pos=4, col="red", cex=1.6)
print(dist(sl.clus.dat,method="euclidian"),digits=3)

sl.out<-hclust(dist(sl.clus.dat,method="euclidian"),method="single")
plot(sl.out,
     lwd=3,
     col="blue",
     ## col.lab = "brown",
     col.axis = "brown",
     ylab="Distance",
     xlab="Clustering of the expression of 5 genes",
     hang=-1,
     main=NA,
     sub=NA,
     axes=FALSE)
axis(side = 2, at = seq(0, 3.5, .5), col = "brown",labels = TRUE, lwd = 4)

# Example 2. Relating data generation processes to cluster trees. It is of...
x<-rnorm(20,0,1)
singlelinkage.out<-hclust(dist(x,method="euclidian"),method="single")
plot(singlelinkage.out,
     lwd=3,
     col="blue",
     col.axis = "brown",
     ylab="Distance",
     xlab="20 genes with normal random distances",
     hang=-1,
     main=NA,
     sub=NA,
     axes=FALSE)
axis(side = 2, at = seq(0, 1.4, .2), col = "brown",labels = TRUE, lwd = 4)

x2 = matrix(rnorm(200,0,1), ncol=20) # 10 samples
fit <- pvclust(x2, method.hclust="average", method.dist="euclidean", nboot=1000)
plot(fit, main="", cex.lab=1.5) # dendogram with p values

x <- c(rnorm(10,0,0.1),rnorm(10,3,0.5),rnorm(10,10,1.0))
sl.out<-hclust(dist(x,method="euclidian"),method="single")
plot(sl.out,
     lwd=3,
     col.axis = "brown",
     col="magenta",
     ylab="Distance",
     xlab="3 clusters of 10 genes each",
     hang=-1,
     main=NA,
     sub=NA,
     axes=FALSE)
axis(side = 2, at = seq(0, 5, 1), col = "brown",labels = TRUE, lwd = 4)

x2 = matrix(c(rnorm(100,0,0.1),rnorm(100,3,0.5),rnorm(100,10,1.0)), ncol=30)
fit <- pvclust(x2, method.hclust="average", method.dist="euclidean", nboot=1000)
plot(fit, main="", cex.lab=1.5) # dendogram with p values

# Example 3. Application to the Golub (1999) data. Recall that the first...
data(golub, package="multtest")
zyxin <- grep("Zyxin",golub.gnames[,2])
ccnd3 <- grep("CCND3",golub.gnames[,2])
clusdata <- data.frame(golub[ccnd3,],golub[zyxin,])
colnames(clusdata)<-c("CCND3 Cyclin D3","Zyxin")
golubFactor <- factor(golub.cl,levels=0:1, labels= c("ALL","AML"))
plot(clusdata,
     pch=as.numeric(golubFactor) + 15,
     col=as.numeric(golubFactor) + 1)
legend("topright",
       legend=c("ALL","AML"),
       pch=16:17,
       col=c(2,3))

plot(hclust(dist(clusdata,method="euclidian"),method="single"),
     lwd=3,
     col="blue",
     col.axis = "brown",
     ylab="Distance",
     xlab="Clustering of patients by gene expression",
     hang=-1,
     main=NA,
     sub=NA,
     axes=FALSE)
axis(side = 2, at = seq(0, 1.2, .2), col = "brown",labels = TRUE, lwd = 4)

########################################
# 7.2.2 k-means
########################################

# Example 1. Relating a data generation process to k-means cluster analysis....
data <- rbind(matrix(rnorm(100,0,0.5), ncol = 2),
              matrix(rnorm(100,2,0.5), ncol = 2))
cl <- kmeans(data, 2)
plot(data, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex=2)
initial <- matrix(c(0,0,2,2), nrow = 2, ncol=2, byrow=TRUE)
cl<- kmeans(data, initial, nstart = 10)
n <- 100; nboot<-1000
boot.cl <- matrix(0,nrow=nboot,ncol = 4)

for (i in 1:nboot) {
    dat.star <- data[sample(1:n,replace=TRUE),]
    cl <- kmeans(dat.star, initial, nstart = 10)
    boot.cl[i,] <- c(cl$centers[1,],cl$centers[2,])
}

quantile(boot.cl[,1],c(0.025,0.975))
quantile(boot.cl[,2],c(0.025,0.975))
quantile(boot.cl[,3],c(0.025,0.975))
quantile(boot.cl[,4],c(0.025,0.975))

# Example 2. Application to the Golub (1999) data. In the above we found...
data <- data.frame(golub[ccnd3,],golub[zyxin,])
colnames(data) <- c("CCND3 (Cyclin D3)","Zyxin")
cl <- kmeans(data, 2,nstart = 10)
cl
boot.cl <- matrix(0,nrow=nboot,ncol = 4)

plot(data, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex=2)

for (i in 1:nboot) {
  dat.star <- data[sample(1:nrow(data),replace=TRUE),]
  cl <- kmeans(dat.star, initial, nstart = 10)
  boot.cl[i,] <- c(cl$centers[1,],cl$centers[2,])
}


quantile(boot.cl[,1],c(0.025,0.975))
quantile(boot.cl[,2],c(0.025,0.975))
quantile(boot.cl[,3],c(0.025,0.975))
quantile(boot.cl[,4],c(0.025,0.975))

########################################
# 7.3 The correlation coefficient
########################################

# Example 1. Teaching demonstration. To develop intuition with respect to...
library(TeachingDemos)
run.cor.examp(1000)

# Example 2. Another teaching demonstration. By the function put.points.demo()
put.points.demo()

# Example 3. Application to the Golub (1999) data. We shall illustrate the...
library(multtest); data(golub)
mcm3 <- grep("MCM3",golub.gnames[,2])
mcm3
golub.gnames[mcm3,2]
x <- golub[mcm3[1],]; y <- golub[mcm3[2],]
cor(x,y)
plot(x,y)
cor.test(x,y)

# Example 4. Confidence interval by the bootstrap. Another method to con-...
nboot <- 1000; boot.cor <- matrix(0,nrow=nboot,ncol = 1)
data <- matrix(c(x,y),ncol=2,byrow=FALSE)

for (i in 1:nboot) {
    dat.star <- data[sample(1:nrow(data),replace=TRUE),]
    boot.cor[i,] <- cor(dat.star)[2,1]}

mean(boot.cor)
quantile(boot.cor[,1],c(0.025,0.975))

# Example 5. Application to the Golub (1999) data. The ALL and AML...
library(multtest); data(golub)
corgol <- apply(golub, 1, function(x) cor(x,golub.cl))
o <- order(corgol)
golub.gnames[o[3041:3051],2]

########################################
# 7.4 Principal Components Analysis
########################################

V <- matrix(c(2,1,2.8,2.6),nrow=2, byrow=TRUE)
plot(V,xlim = c(0,3),ylim = c(0,3), col="black", pch=19, cex=1.5, xlab="Variable 1", ylab="Variable 2")
arrows(0,0,2,1, col="red", lwd=2)
arrows(0,0,2.8,2.6, col="blue", lwd=3)

Z <- matrix(c( 1.63, 1.22, -0.40, 0.79, 0.93, 0.97, -1.38, -1.08, -0.17, -0.96, -0.61, -0.93), nrow=6, byrow=TRUE)
K <- eigen(cor(Z))
X<-matrix(c(1.45, 1.09, -0.44, 0.7, 0.8, 0.86, -1.35, -0.99, -0.23, -0.88, -0.64, -0.85), nrow=6, byrow=TRUE)
for(j in 1:2) Z[,j] = (X[,j] - mean(X[,j]))/(0.912871*sd(X[,j]))
print(Z,digits=3)
pca <- princomp(Z, center = TRUE, cor=TRUE, scores=TRUE)
pca$scores
Z %*% K$vec
C <- array(dim=c(6,2))
C[,1] <- pca$scores[,1] * sin(pi/4)
C[,2] <- pca$scores[,1] * cos(pi/4)
plot(Z,xlim = c(-2,2),ylim = c(-2,2), col="blue", pch=19, cex=1.5, xlab="Variable 1", ylab="Variable 2")
arrows(-2,-2,2,2, col="green", lwd=3)
segments(Z[1,1],Z[1,2],C[1,1],C[1,2], col="red", lwd=2)
segments(Z[2,1],Z[2,2],C[2,1],C[2,2], col="red", lwd=2)
segments(Z[3,1],Z[3,2],C[3,1],C[3,2], col="red", lwd=2)
segments(Z[4,1],Z[4,2],C[4,1],C[4,2], col="red", lwd=2)
segments(Z[5,1],Z[5,2],C[5,1],C[5,2], col="red", lwd=2)
segments(Z[6,1],Z[6,2],C[6,1],C[6,2], col="red", lwd=2)

# Example 1. Using R on the above data. It is convenient to store the data of...
V1 <- c( 1.63, -0.40, 0.93, -1.38, -0.17, -0.61)
V2 <- c( 1.22, 0.79, 0.97, -1.08, -0.96, -0.93)
cor.test(V1,V2)

plot(V1,V2, xlim=c(-2,2), ylim=c(-2,2), col="red", pch=15)

Z <- matrix(c( V1, V2), nrow=6, byrow=FALSE)
cor(Z)

K <- eigen(cor(Z))
print(K,digits=2)

Z %*% K$vec[,1]

print(Z %*% K$vec, digits=2)

pca <- princomp(Z, center = TRUE, cor=TRUE, scores=TRUE)
pca$scores

# Example 2. Application to the Golub (1999) data. The first five eigenvalues...
eigen(cor(golub))$values[1:5]

data <- golub; p <- ncol(data); n <- nrow(data) ; nboot<-1000
eigenvalues <- array(dim=c(nboot,p))

for (i in 1:nboot) {
    dat.star <- data[sample(1:n,replace=TRUE),]
    eigenvalues[i,] <- eigen(cor(dat.star))$values
}

for (j in 1:p) {
    print(quantile(eigenvalues[,j],c(0.025,0.975)))
}

for (j in 1:5) {
    cat(j,as.numeric(quantile(eigenvalues[,j],c(0.025,0.975))),"\n" )
}

sum(eigen(cor(golub))$values[1:2])/38*100

pca <- princomp(golub, center = TRUE, cor=TRUE, scores=TRUE)
o <- order(pca$scores[,2])
golub.gnames[o[1:10],2]
golub.gnames[o[3041:3051],2]

# Example 3. Biplot. A useful manner to plot both genes (cases) and patients...
biplot(princomp(data,cor=TRUE),pc.biplot=TRUE,cex=0.5,expand=0.8, xlab="Component 1", ylab="Component 2")

# Example 4. Critical for S-phase. Golub et al. (1999) mention that among...
data(golub, package = "multtest")
golubFactor <- factor(golub.cl)
o1 <- grep("CD",golub.gnames[,2])
o2 <- grep("Op",golub.gnames[,2])
o3 <- grep("MCM",golub.gnames[,2])
o <- c(o1,o2,o3)
length(o)

pt <- apply(golub, 1, function(x) t.test(x ~ golubFactor)$p.value)
oo <- o[pt[o]<0.01]

Z <- as.matrix(scale(golub, center = TRUE, scale = TRUE))
K <- eigen(cor(Z))
P <- Z %*% -K$vec[,1:2]
leu <- data.frame(P[oo,], row.names= oo)

plot(leu,xlim=c(-10,15), ylim=c(-10,10), pch=19, cex=1.2, xlab="Principal Component 1", ylab="Principal Component 2", col="darkgreen")
text(x = leu$X1, y=leu$X2, labels=rownames(leu), pos = 1, col="blue")
fac <- as.integer(oo %in% o1) + 2 * as.integer(oo %in% o2) + 3 * as.integer(oo %in% o3)
text(x = leu$X1, y=leu$X2, labels=fac, pos = 3, col="red")

cl <- hclust(dist(leu,method="euclidian"),method="single")
plot(cl,
     lwd=3,
     col="blue",
     col.axis = "brown",
     ylab="Distance",
     xlab="Clustering of the expression of genes",
     hang=-1,
     main=NA,
     sub=NA,
     axes=FALSE)
axis(side = 2, at = seq(0, 5, 1), col = "brown",labels = TRUE, lwd = 4)

a <- as.integer(rownames(leu)[cl$order])
for (i in 1:length(a)) {
    cat(a[i],golub.gnames[a[i],2],"\n")
}