###############################################################################
###############################################################################
# Chapter 1
###############################################################################
###############################################################################

########################################
# 1.1 Getting R Started on your PC
########################################
# Install the CRAN packages used in this chapter (only needed once per R
# installation). Argument names are written out in full instead of relying on
# partial matching (`repo`, `dep`), and CRAN is reached over https.
install.packages("TeachingDemos", repos = "https://cran.r-project.org",
                 dependencies = TRUE)
install.packages("BurStMisc", repos = "https://cran.r-project.org")

library("BurStMisc")
library(TeachingDemos)
plot(dice(12, 1))

# Bioconductor retired the biocLite.R script and biocLite(); BiocManager is
# the supported installer, so bootstrap it from CRAN when it is missing.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cran.r-project.org")
}
BiocManager::install()    # install or update your Bioconductor packages

BiocManager::install("ALL")   # do the install only once!
library(ALL)      # load the "ALL" library into the current R session
data(ALL)         # make the data stored in the library available for outside access

BiocManager::install("multtest")   # do the install only once!
library(multtest)
data(golub)

# Inspect what was loaded: the class and structure of `golub`, then the
# objects currently defined in the workspace.
class(golub)
str(golub)
ls()

########################################
# 1.2 Getting help
########################################
library(help = "stats")   # overview of the stats package
help("sum")               # long form of ?sum
apropos("diff")           # names of objects matching "diff"
example("boxplot")        # run the examples from the boxplot help page
help.start()              # open the HTML help system

########################################
# 1.3 Calculating with R
########################################
# R as a calculator: each expression is evaluated and printed.
2 + 3
exp(1)
sum(1:5)
prod(1:5)

########################################
# 1.4 Generating a sequence and a factor
########################################
1:5
seq(from = 0, to = 1, by = 0.1)
# gl() generates a factor: 3 levels, each repeated 5 times.
factor <- gl(n = 3, k = 5)
factor

############################################################################################################
# Assigning variables in R
############################################################################################################
# Two ways to assign values: `<-` and `=`.
x <- 4
y = 3
# Multiple chained assignment: both x and y end up holding 7.
x <- y <- 7

############################################################################################################
# 1.5 R is a vectorized language - A vector is the most basic datatype for data storage in R.
# It stores one or more values of the same type.
############################################################################################################
x <- c(11, 12, 13, 14)
x <- 11:14
x
x + 100   # arithmetic is applied to every element
x^2       # squaring is vectorized too

############################################################################################################
# Brackets are used for subsetting, returns a vector with potentially a subset of elements
############################################################################################################
x[4]   # the fourth element


############################################################################################################
# "everything is a vector" (or made of vectors)
############################################################################################################
x <- 22
x
# Indexing a length-one vector with [1] yields a length-one vector again,
# so the subsetting can be repeated indefinitely.
x[1]
x[1][1]
x[1][1][1]
x[1][1][1][1]


############################################################################################################
# all native functions in R are "vectorized"
############################################################################################################
x <- 11:14
x > 12
x[x > 12]
x[1:3]
is.na(x)
cumsum(x)

############################################################################################################
# classes of different vectors
############################################################################################################
x <- c("DNA", "RNA", "Protein")
class(x)
x <- c(1.2, 1.5)
class(x)
x <- 1:4
class(x)
x <- x > 12
class(x)

########################################
# 1.6 Computing on a data vector
########################################
# Three expression measurements for one gene.
gene1 <- c(1.00, 1.50, 1.25)
gene1

# Built-in summaries next to their hand-computed equivalents.
sum(gene1)
mean(gene1)
sum(gene1) / length(gene1)
sd(gene1)
sqrt(sum((gene1 - mean(gene1))^2) / (length(gene1) - 1))
summary(gene1)

############################################################################################################
# Getting rid of NAs
############################################################################################################
# Vector of strings
x <- c("DNA", "RNA", "Protein")
x
# Overwrite the second element with a missing value
x[2] <- NA
x
# Keep only the elements that are not NA
y <- x[!is.na(x)]

########################################
# 1.7 Constructing a data matrix
########################################
gene2 <- c(1.35, 1.55, 1.00)
gene3 <- c(-1.10, -1.50, -1.25)
gene4 <- c(-1.20, -1.30, -1.00)

# Row names (genes) first, column names (persons) second.
rowColNames <- list(
  c("gene1", "gene2", "gene3", "gene4"),
  c("Eric", "Peter", "Anna")
)

# byrow = TRUE fills the matrix one gene per row.
geneData <- matrix(
  data = c(gene1, gene2, gene3, gene4),
  nrow = 4,
  ncol = 3,
  byrow = TRUE,
  dimnames = rowColNames
)
geneData
geneData[1, 2]   # single cell: row 1, column 2
geneData[1, ]    # whole first row
geneData[, 2]    # whole second column

# Create the course directory with R's own dir.create() instead of shelling
# out to bash: it works on every platform, expands "~", and with
# showWarnings = FALSE it stays quiet when the directory already exists.
dir.create("~/biology664", showWarnings = FALSE)

# Round-trip the matrix through a plain-text table on disk.
# NOTE(review): the file is written by write.table() as text despite its
# ".Rdata" extension; the name is kept so existing paths remain valid.
write.table(geneData, file = "~/biology664/geneData.Rdata")
geneDataRead <- read.table("~/biology664/geneData.Rdata")
geneDataRead

########################################
# 1.8 Computing on a data matrix
########################################
# MARGIN = 2 applies the function per column, MARGIN = 1 per row.
apply(geneData, MARGIN = 2, FUN = mean)
apply(geneData, MARGIN = 1, FUN = mean)

# Order the rows by their mean expression, largest first.
meanExpressions <- apply(geneData, MARGIN = 1, FUN = mean)
o <- order(meanExpressions, decreasing = TRUE)
o

geneData[o, ]                      # rows reordered by mean expression
geneData[c(1, 2), ]                # rows selected by position
geneData[c("gene1", "gene2"), ]    # rows selected by name

# Logical selection: keep the rows whose mean is positive.
meanExpressions > 0
geneData[meanExpressions > 0, ]

########################################
# 1.9 Application to the Golub (1999) data
########################################
library(multtest)
data(golub)

# golub.cl is turned into the ALL/AML factor further down.
class(golub.cl)
length(golub.cl)
golub.cl

# Gene name table that accompanies the expression matrix.
class(golub.gnames)
dim(golub.gnames)
head(golub.gnames)

# The expression matrix itself.
class(golub)
nrow(golub)
ncol(golub)
dim(golub)

golub.gnames[1042, ]
golub[1042, 2]

golub[, 1]   # patient 1 gene expression profile
golub[1042, ]

golub[1042, seq_len(27)]

# Recode the 0/1 labels as a factor with levels "ALL" and "AML".
golubFactor <- factor(golub.cl, levels = 0:1, labels = c("ALL", "AML"))
golubFactor == "ALL"
golub[1042, golubFactor == "ALL"]

# Row-wise mean over the columns labelled "ALL".
meanALL <- apply(golub[, golubFactor == "ALL"], MARGIN = 1, FUN = mean)
head(meanALL)

# Case-insensitive search for "CD33" in the second column of golub.gnames.
cd33 <- grep(pattern = "CD33", x = golub.gnames[, 2], ignore.case = TRUE)
cd33
golub[cd33, ]
golub.gnames[cd33, ]

############################################################################################################
# 1.10 Constructing a data.frame - A data.frame is used for storing data tables (like a matrix), but where different columns can contain
# different datatypes. A data.frame consists of a list of column vectors of equal length, where the data in each column
# must be of the same type.
############################################################################################################
# Variant 1: supply the 3 column names while creating the data.frame.
patients.df <- data.frame(
  patientID = c("101", "102", "103", "104"),
  treatment = c("drug", "placebo", "drug", "placebo"),
  age       = c(20, 30, 24, 22)
)

# Variant 2: create the data.frame from unnamed vectors, then set the
# 3 column names afterwards.
patients.df <- data.frame(
  c("101", "102", "103", "104"),
  c("drug", "placebo", "drug", "placebo"),
  c(20, 30, 24, 22)
)
colnames(patients.df) <- c("patientID", "treatment", "age")

# Inspect the result.
patients.df
nrow(patients.df)
ncol(patients.df)
head(patients.df)

############################################################################################################
# 1.11 Referencing column vectors in a data.frame
############################################################################################################
patients.df[[2]]             # column vector by position (double brackets)
patients.df[["treatment"]]   # column vector by name, method 1 (double brackets)
patients.df$treatment        # column vector by name, method 2 - THE PREFERRED METHOD
patients.df[, 2]             # column vector: all rows, column 2 (hardcoded)

############################################################################################################
# 1.12 Row and column subsetting (slicing) on a data.frame (returns a dataframe with a subset of the rows)
############################################################################################################
patients.df[c(1, 3), ]                          # a hardcoded row slice containing rows 1 and 3
patients.df[patients.df$treatment == "drug", ]  # a row slice using logical indexing - THE PREFERRED METHOD

# The same selection with the logical index stored in a variable first.
treatmentIsDrug <- patients.df$treatment == "drug"
treatmentIsDrug
patients.df[treatmentIsDrug, ]

subset(patients.df, treatment == "drug")        # a row slice using the subset function

# Use the patient IDs as row names so that rows can be selected by name.
row.names(patients.df) <- patients.df$patientID
patients.df
patients.df[c("101", "103"), ]                  # a row slice using name indexing

############################################################################################################
# column slice or subset (returns a dataframe with a subset of the columns)
############################################################################################################
patients.df[2]                       # column slice by position (hardcoded)
patients.df["treatment"]             # column slice by name - THE PREFERRED METHOD
patients.df[c("treatment", "age")]   # several columns by name - THE PREFERRED METHOD


############################################################################################################
# 1.13 Adding columns to a data.frame
############################################################################################################
# cbind() returns a new data.frame with the extra column appended;
# patients.df itself is left unchanged.
patients.df2 <- cbind(patients.df, weight = c(160, 114, 210, 102))
patients.df2

patients.df3 <- cbind(
  patients.df2,
  gender = c("male", "female", "male", "female")
)
patients.df3

############################################################################################################
# 1.13 (continued) Adding rows to a data.frame
############################################################################################################
# First table: four patients, column names given at construction.
patients.df1 <- data.frame(
  patientID = c("101", "102", "103", "104"),
  treatment = c("drug", "placebo", "drug", "placebo"),
  age       = c(20, 30, 24, 22)
)
# Second table: three more patients with the same three columns.
patients.df2 <- data.frame(
  patientID = c("97", "98", "99"),
  treatment = c("drug", "placebo", "drug"),
  age       = c(24, 31, 42)
)
# rbind() stacks the rows of both tables.
# NOTE(review): the variable name is misspelled ("pateints"); it is kept
# unchanged so that any later reference to it still resolves.
pateints.df1and2 <- rbind(patients.df1, patients.df2)
pateints.df1and2

############################################################################################################
# 1.14 Merging two data.frames
############################################################################################################
# Table A: treatment and age per patient.
A <- data.frame(
  patientID = c("101", "102", "103", "104"),
  treatment = c("drug", "placebo", "drug", "placebo"),
  age       = c(20, 30, 24, 22)
)
# Table B: gender and weight per patient; shares only IDs 101 and 102 with A.
B <- data.frame(
  patientID = c("101", "102", "105", "106"),
  gender    = c("male", "female", "male", "female"),
  weight    = c(160, 114, 224, 130)
)

# Inner join: only the patients present in both tables.
innerJoin <- merge(x = A, y = B, by = "patientID", all = FALSE)
innerJoin

# Left outer join: every patient of A, with NA where B has no match.
leftOuterJoin <- merge(x = A, y = B, by = "patientID", all.x = TRUE)
leftOuterJoin

# Full outer join: every patient of either table.
outerJoin <- merge(x = A, y = B, by = "patientID", all = TRUE)
outerJoin

############################################################################################################
# 1.15 Constructing a list
# A list is a generic vector containing other objects. It is the most generic (and confusing) datatype in R.
# A list can contain any data structure inside it, including other lists.
############################################################################################################
# A list whose elements have no names.
list1 <- list(
  c("p53", "p63", "p73"),
  matrix(1:10, nrow = 2),
  c(TRUE, FALSE, TRUE, FALSE, FALSE)
)
list1

# The better way is to give every element a name.
p53FamilyGenes <- c("p53", "p63", "p73")
list2 <- list(
  genes     = p53FamilyGenes,
  matrix1   = matrix(1:10, nrow = 2),
  remission = c(TRUE, FALSE, TRUE, FALSE, FALSE)
)
list2

############################################################################################################
# 1.16 Subsetting (slicing) of a list (returns a list with a subset of the elements) using single brackets
############################################################################################################
list1[1]                        # sublist holding element 1 (hardcoded)
list1[1:2]                      # sublist holding elements 1 and 2 (hardcoded)
list2["genes"]                  # sublist holding the element named "genes" - THE PREFERRED METHOD
list2[c("genes", "remission")]  # sublist holding "genes" and "remission" - THE PREFERRED METHOD

############################################################################################################
# list member referencing using double brackets
############################################################################################################
list2[[1]]               # element 1 itself (hardcoded)
list2[[1]][1] <- "her2"  # overwrite the first value inside element 1 (hardcoded)
list2[[1]]               # element 1 now starts with "her2"
p53FamilyGenes           # the original variable p53FamilyGenes is unaffected!

list2[["genes"]]         # named referencing method 1
list2$genes              # named referencing method 2 - THE PREFERRED METHOD

############################################################################################################
# 1.17 Search Path Attachment
############################################################################################################
# While a list is attached, its elements can be referenced by bare name.
attach(list2)
genes
detach(list2)

# The same works for the columns of a data.frame.
attach(patients.df)
treatment
detach(patients.df)

############################################################################################################
# Beware of masking!!!!!
############################################################################################################
# A global variable with the same name as a list element is defined first.
genes <- c("p53", "p63", "p73")
attach(list2)
remission
genes          # PROBLEM!!: remission is what we expect, but not genes!! genes has been "masked" by the global value
detach(list2)  # always detach at the end of your R script!


############################################################################################################
# 1.18 R is pass-by-value
############################################################################################################
# The console prompt markers ("> ") that were pasted in front of these lines
# have been removed: a leading ">" is a syntax error and prevents the whole
# file from being parsed by source().
x <- 10:20     # x is assigned a vector of integers from 10 to 20
x
x * 100        # functions applied to x do not change the values of x
x
y <- x         # assign y to a copy of the vector referenced by x
x <- x * 100   # re-assign x to x * 100
y              # however, re-assigning x has not changed the variable y

########################################
# 1.19 Looping in R
########################################
# Simple counted loop: prints 1 to 5.
for (index in seq_len(5)) {
  print(index)
}

# Leave the loop early: print while index is below stopCondition,
# then break out of the "for loop".
stopCondition <- 4
for (index in seq_len(10)) {
  if (index >= stopCondition) {
    break
  }
  print(index)
}

# while loop: z grows in steps of 2 until it is no longer below 5.
z <- 0
while (z < 5) {
  z <- z + 2
  print(z)
}

########################################
# 1.20 Running scripts
########################################
library(multtest)
data(golub)

# Recode the 0/1 labels in golub.cl as an ALL/AML factor.
golubFactor <- factor(golub.cl, levels = 0:1, labels = c("ALL", "AML"))

# Per-row mean within each of the two groups.
meanAll <- apply(golub[, golubFactor == "ALL"], MARGIN = 1, FUN = mean)
meanAml <- apply(golub[, golubFactor == "AML"], MARGIN = 1, FUN = mean)

# Rank the rows by absolute difference of the group means (largest first)
# and print the second golub.gnames column for the top five.
o <- order(abs(meanAll - meanAml), decreasing = TRUE)
print(golub.gnames[o[1:5], 2])

########################################
# Save the above R commands to ~/biology664/meanDiff.R
########################################
# Forward slashes are used on UNIX and Mac OSs.
source("~/biology664/meanDiff.R")