##############################################
# Looping in R
##############################################
# ALWAYS, ALWAYS, ALWAYS use braces to denote the block of code that is being looped over!!!
# Using braces to denote your code blocks makes your code more readable and less bug prone!

# for(variable in sequence) {
#   statements
# }
# Visit the integers 1 through 5 in order and print each one.
for (index in seq_len(5)) {
  print(index)
}

# Print each index below the threshold; the first index that reaches the
# threshold raises an error via stop(), which ends the loop.
# NOTE: stop() signals an error, so when this file is run non-interactively
# (e.g. with source()) execution halts here.
stopCondition <- 4
for (index in 1:10) {
  if (index >= stopCondition) {
    stop("values need to be < ", stopCondition)
  }
  print(index)
}

# while(condition) {
#   statements
# }
# Start at 0 and keep adding 2 while the value is still below 5,
# printing the updated value after every step.
z <- 0
while (z < 5) {
  z <- z + 2
  print(z)
}

##############################################
# Heatmaps in R
##############################################
library(multtest); data(golub)

# greenred() is provided by gplots, which this script attaches only later,
# so it is namespace-qualified here to work in a fresh session.

# No scaling and no dendrogram (clustering)
heatmap(
  golub,
  Rowv = NA, Colv = NA,
  scale = "none",
  col = gplots::greenred(75)
)

# Row scaling and no dendrogram (clustering)
heatmap(
  golub,
  Rowv = NA, Colv = NA,
  scale = "row",
  col = gplots::greenred(75)
)

# No scaling and row dendrogram (clustering)
heatmap(golub, Colv = NA, scale = "none", cexRow = 0.1)

# Rank the rows of golub by the absolute difference between their mean
# expression in ALL patients and in AML patients, largest difference first.
# golub.cl codes the patient class as 0 (ALL) or 1 (AML).
gol.fac <- factor(golub.cl, levels = 0:1, labels = c("ALL", "AML"))
# rowMeans() is the vectorised equivalent of apply(x, 1, mean).
mall <- rowMeans(golub[, gol.fac == "ALL"])
maml <- rowMeans(golub[, gol.fac == "AML"])
o <- order(abs(mall - maml), decreasing = TRUE)

# Create a set of biomarkers out of the top 50
biomarkers50 <- golub[o[1:50], ]
dim(biomarkers50)

# No scaling and no dendrogram (clustering)
heatmap(biomarkers50, Rowv = NA, Colv = NA, scale = "none")

# Row scaling and no dendrogram (clustering)
heatmap(biomarkers50, Rowv = NA, Colv = NA, scale = "row")

# Create a red-green heatmap.
# greenred() is provided by gplots, which this script attaches only later,
# so it is namespace-qualified here to work in a fresh session.
heatmap(
  biomarkers50,
  Rowv = NA, Colv = NA,
  scale = "none",
  col = gplots::greenred(75)
)

# Use heatmap.2 in order to add a legend or a histogram or other fancy stuff
library("gplots")

# Raw values, clustering switched off for both rows and columns.
heatmap.2(
  biomarkers50,
  Rowv = NA,
  Colv = NA,
  dendrogram = "none",
  scale = "none",
  col = greenred(75),
  key = TRUE,
  symkey = FALSE,
  density.info = "none",
  trace = "none",
  cexRow = 0.5
)

# Values scaled per row, clustering still switched off.
heatmap.2(
  biomarkers50,
  Rowv = NA,
  Colv = NA,
  dendrogram = "none",
  scale = "row",
  col = greenred(75),
  key = TRUE,
  symkey = FALSE,
  density.info = "none",
  trace = "none",
  cexRow = 0.5
)

# Values scaled per row, with both row and column dendrograms drawn.
heatmap.2(
  biomarkers50,
  dendrogram = "both",
  scale = "row",
  col = greenred(75),
  key = TRUE,
  symkey = FALSE,
  density.info = "none",
  trace = "none",
  cexRow = 0.5
)

# Question 1: If we choose the default dendrogram distance function on the top 50 difference-between-the-mean-expression values as our
# classifier, what is our misclassification rate?

# Answer: 3 out of 11 - We misclassify patients 28, 32, 35.

# Widen the biomarker set to the 100 top-ranked rows of golub.
biomarkers100 <- golub[o[1:100], ]

# Values scaled per row, with both row and column dendrograms drawn.
heatmap.2(
  biomarkers100,
  dendrogram = "both",
  scale = "row",
  col = greenred(75),
  key = TRUE,
  symkey = FALSE,
  density.info = "none",
  trace = "none",
  cexRow = 0.3
)


# Question 2: What is our misclassification rate now?

# Question 3: For this data and our primitive classifier, is using 100 markers better than using 50?