Why Using R?

R is a programming language designed for statistical computing and graphics. Unlike a compiled language, such as C and Fortran, R is typically used as an interpreted language. There are several reasons why I choose R here.

To get started, one needs to download and install R and RStudio from the following websites: CRAN and RStudio. I recommend R version 3.6.3 for this tutorial.

From the R console, type the following command to get help.

help(runif)     # show documentation for the topic -- runif in this example

A crash course of R

I will not provide a detailed R tutorial, as they are already available. I recommend the following resources for learning R.

Here, I will provide a brief overview of R syntax and highlight a few R’s unique features.

Typical data types & math operations

a = 10 + 3.14    # integer, real number; assignment to a
a*a # a multiplies a
## [1] 172.6596
a**2 # square of a, which gives the same value as the previous one
## [1] 172.6596
b = 3.0 + 4.0i  # complex number; assignment to b
Mod(b) # The modulus of b
## [1] 5
d = (3 + 4 == 7)   # logical; d is TRUE
!d   # not d, which is FALSE
## [1] FALSE
e = "Monday"  # character
class(e)  # show the data type of e
## [1] "character"
nchar(e)  # number of characters in a string
## [1] 6

Vectors

vec = c("Man","Woman","Woman","Man","Woman")   # a vector of characters
length(vec) #length of a vector
## [1] 5
vec == "Woman" # logic values, compare each element to "Woman"
## [1] FALSE  TRUE  TRUE FALSE  TRUE
which(vec == "Woman") # identify the indices of the elements being "Woman"
## [1] 2 3 5

Factors

Factor type is an important data type in R. It is commonly used to represent categorical data.

vec_factor = as.factor(vec)   # convert the character vector to a factor vector
class(vec);class(vec_factor)  # check the data types
## [1] "character"
## [1] "factor"
levels(vec_factor); nlevels(vec_factor)  # levels and the number of levels
## [1] "Man"   "Woman"
## [1] 2

If statement

a = 0
if(a == 1){   # conditions
  print("a equals to 1")
} else {
  print("a is not 1")
}
## [1] "a is not 1"

For loops

for (i in 1:5) print(i)   # version 1
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
for (i in seq_len(5)) print(i)   # version 2
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
v = c(1,2,5,7)
for (i in v){
  print(i)   # version 3
}
## [1] 1
## [1] 2
## [1] 5
## [1] 7

While statement

i = 0
while(i < 5){
  i = i + 2
  print(i)
}
## [1] 2
## [1] 4
## [1] 6

Apply

Another unique feature in R. Apply performs a function iteratively through an array or matrix. It’s usually more efficient than a for-loop. Syntax: apply(X, MARGIN, FUN). Here, X is an array or matrix. MARGIN=1 performs the specified function by rows, MARGIN=2 by columns. FUN is the function to apply.

mat = matrix(rnorm(16), nrow = 4)    # generate a random matrix of 4 x 4
mat
##             [,1]       [,2]       [,3]         [,4]
## [1,]  1.38854630 0.01348135  0.6930500 -0.363760364
## [2,]  0.09646337 1.12088503 -0.6476840  0.927960412
## [3,] -0.33320608 0.14337149  0.2026408  0.286330142
## [4,] -0.92728526 1.74010565 -0.6932591  0.004327818
apply(mat, 1, sum)   # row sum
## [1] 1.7313173 1.4976248 0.2991364 0.1238891
rowSums(mat)   # An alternative way to do row sum
## [1] 1.7313173 1.4976248 0.2991364 0.1238891
apply(mat, 2, sum)   # column sum
## [1]  0.2245183  3.0178435 -0.4452522  0.8548580
colSums(mat)  # An alternative way to do column sum
## [1]  0.2245183  3.0178435 -0.4452522  0.8548580

There are other apply-like functions, such as lapply, sapply, tapply, etc. Check here for more details.

Functions

myfunction <- function(x) {    # a simple function to perform x square
  return(x*x)
}
b = myfunction(4)
print(paste0("b = ",b))
## [1] "b = 16"

List

Lists can contain elements of different types.

my_list = list("a", c(1,2,3), FALSE, 3.14)  # define a new list
my_list[[1]]   # the first element
## [1] "a"
my_list[[2]]   # the second element
## [1] 1 2 3
names(my_list) = c("letters", "array", "TF", "pi")  # name list elements
my_list$TF   # retrieve elements by names
## [1] FALSE
my_vector = unlist(my_list)   # convert the list to a vector
my_vector
## letters  array1  array2  array3      TF      pi 
##     "a"     "1"     "2"     "3" "FALSE"  "3.14"

Data frame

Data frame is widely used in R. It’s similar to a matrix but allows mixed data types in it.

my_data = data.frame(
          id = c(1:5),
          atom = c("N","CA","CB","C","O"),
          mass = c(14, 12, 12, 12, 16),
          size = c(1.2, 1.4, 1.4, 1.4, 1.1),
          stringsAsFactors = F
)
print(my_data)
##   id atom mass size
## 1  1    N   14  1.2
## 2  2   CA   12  1.4
## 3  3   CB   12  1.4
## 4  4    C   12  1.4
## 5  5    O   16  1.1
colnames(my_data)
## [1] "id"   "atom" "mass" "size"
str(my_data)      # get the structure 
## 'data.frame':    5 obs. of  4 variables:
##  $ id  : int  1 2 3 4 5
##  $ atom: chr  "N" "CA" "CB" "C" ...
##  $ mass: num  14 12 12 12 16
##  $ size: num  1.2 1.4 1.4 1.4 1.1
my_data$mass      # retrieve a column
## [1] 14 12 12 12 16
my_data$backbone = c(T, T, F, T, T)   # add a column
print(my_data)
##   id atom mass size backbone
## 1  1    N   14  1.2     TRUE
## 2  2   CA   12  1.4     TRUE
## 3  3   CB   12  1.4    FALSE
## 4  4    C   12  1.4     TRUE
## 5  5    O   16  1.1     TRUE
my_data = rbind(my_data, c(6, "S", 32, 1.6, F))   # add a row
print(my_data)
##   id atom mass size backbone
## 1  1    N   14  1.2     TRUE
## 2  2   CA   12  1.4     TRUE
## 3  3   CB   12  1.4    FALSE
## 4  4    C   12  1.4     TRUE
## 5  5    O   16  1.1     TRUE
## 6  6    S   32  1.6    FALSE

Input/output

save(my_data,file = "extra/data/01A/my_data.Rdata")    # save the data frame my_data to file
saveRDS(my_data, file = "extra/data/01A/my_data.RDS")  # save my_data to an RDS file
load("extra/data/01A/my_data.Rdata")   # read data from an Rdata file; my_data, if exist, would be overwritten
my_data2 = readRDS("extra/data/01A/my_data.RDS")   # read data from an RDS file, assign it to another variable

Package

The following code works to install/remove packages

if(! "umap" %in% rownames(installed.packages())) {      # check if the package exists
  install.packages("umap")   # if not, install "umap"
}
library("umap")    # load the umap library
help(umap)
remove.packages("umap")    # remove it ...

Basic plotting

x = seq_len(5)
y = x**2
plot(x, y)       # plot points

curve(x**2)      # plot f(x) = x*x

y = data.frame(xvalues = rnorm(1000), yvalues = rnorm(1000))    # 1000 random points in 2D
plot(y$xvalues, y$yvalues)     # scatter plot
library(ggplot2)

ggplot(y, aes(x = xvalues, y = yvalues)) +    # density map
  stat_density2d(aes(fill = ..density..), geom = "raster", contour = FALSE)

A few notes

The previous section highlights the basic usage of R. Please refer to “An Introduction to R” and R & Bioconductor Manual for more details. In the following, I will emphasize a few R features that new learners may overlook or be confused about.

Factor vs. string

Factors may look similar to Strings, but they are very different in R scripting. What makes things worse is that, in R versions before 4.0, stringsAsFactors = TRUE by default. That means, for R functionality using the parameter stringsAsFactors, any string would be interpreted as a factor . However, in R versions 4.0 or later, the default stringsAsFactors = FALSE. If a package is developed for R before v4.0 and by assuming default treatment of strings (as factors), the same package may not work properly for R v.4.0 or later. A good practice for R programming is (1) Use R version 3.6.3 if many legacy R packages are required; (2) always specify how string would be treated. Here is an example to create a data frame containing characters/strings.

my_data = data.frame(
          id = c(1:5),
          atom = c("N","CA","CB","C","O"),
          mass = c(14, 12, 12, 12, 16),
          size = c(1.2, 1.4, 1.4, 1.4, 1.1)
)
print(my_data)
##   id atom mass size
## 1  1    N   14  1.2
## 2  2   CA   12  1.4
## 3  3   CB   12  1.4
## 4  4    C   12  1.4
## 5  5    O   16  1.1
class(my_data$atom)   # factor if R version < 4.0; character if R version >=4.0
## [1] "factor"
my_data = data.frame(
          id = c(1:5),
          atom = c("N","CA","CB","C","O"),
          mass = c(14, 12, 12, 12, 16),
          size = c(1.2, 1.4, 1.4, 1.4, 1.1),
          stringsAsFactors = F
)

class(my_data$atom)  # always character type
## [1] "character"

Here is an example to read a table from a file.

write.table(my_data, file = "extra/data/01A/my_data.txt", quote = F, sep="\t", row.names = F) #The table is saved to a file
data_read = read.table("extra/data/01A/my_data.txt", header = T, sep = "\t")   #Read the table from the saved file
class(data_read$atom)   # factor if R version < 4.0; character if R version >=4.0
## [1] "factor"
data_read = read.table("extra/data/01A/my_data.txt", header = T, sep = "\t", as.is = T)   # as.is = T reads strings as characters
class(data_read$atom)
## [1] "character"

Data frame vs. matrix

Matrix is more suitable for linear algebra analysis; while data frame is good to save heterogeneous data and ideal for statistics analysis, plotting, etc. Sometimes, data can be saved as either a data frame or a matrix. It is important to be aware of the data type, so that appropriate functions/operators are applied. Refer to the following article for the comparison of matrix and data frame. Basic plotting shows an example of plotting with data frame. Here is an example of linear regression with data frame.

plot(my_data$mass, my_data$size, xlab = "Size", ylab = "Mass")    # size vs. mass in my_data
model = lm(size~mass, data = my_data)   # linear regression
abline(model)   # add the fitted line

summary(model)   # summary of the linear model
## 
## Call:
## lm(formula = size ~ mass, data = my_data)
## 
## Residuals:
##        1        2        3        4        5 
## -0.03750  0.00625  0.00625  0.00625  0.01875 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.331250   0.092913   25.09 0.000139 ***
## mass        -0.078125   0.006988  -11.18 0.001534 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.025 on 3 degrees of freedom
## Multiple R-squared:  0.9766, Adjusted R-squared:  0.9687 
## F-statistic:   125 on 1 and 3 DF,  p-value: 0.001534

List vs. vector

Lists are recursive type of vectors – lists can have values of different types. A data frame is a list where column elements have the same data type. In the above-mentioned example of linear regression, model is a list containing results of the linear model.

str(model)   # show the structure of the list
## List of 12
##  $ coefficients : Named num [1:2] 2.3312 -0.0781
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "mass"
##  $ residuals    : Named num [1:5] -0.0375 0.00625 0.00625 0.00625 0.01875
##   ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:5] -2.9069 -0.2795 0.0127 0.0127 0.0394
##   ..- attr(*, "names")= chr [1:5] "(Intercept)" "mass" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:5] 1.24 1.39 1.39 1.39 1.08
##   ..- attr(*, "names")= chr [1:5] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:5, 1:2] -2.236 0.447 0.447 0.447 0.447 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "mass"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.45 1.4
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 3
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = size ~ mass, data = my_data)
##  $ terms        :Classes 'terms', 'formula'  language size ~ mass
##   .. ..- attr(*, "variables")= language list(size, mass)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "size" "mass"
##   .. .. .. ..$ : chr "mass"
##   .. ..- attr(*, "term.labels")= chr "mass"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(size, mass)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "size" "mass"
##  $ model        :'data.frame':   5 obs. of  2 variables:
##   ..$ size: num [1:5] 1.2 1.4 1.4 1.4 1.1
##   ..$ mass: num [1:5] 14 12 12 12 16
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language size ~ mass
##   .. .. ..- attr(*, "variables")= language list(size, mass)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "size" "mass"
##   .. .. .. .. ..$ : chr "mass"
##   .. .. ..- attr(*, "term.labels")= chr "mass"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(size, mass)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "size" "mass"
##  - attr(*, "class")= chr "lm"

Below, mean value is computed for each column of the matrix using lapply.

means = lapply(mat, mean)    # this doesn't work
length(means)
## [1] 16
mat_list =  lapply(seq_len(ncol(mat)), function(i) mat[,i])  # convert to a list by column vectors
means = lapply(mat_list,mean)  # calculate means for a list
means
## [[1]]
## [1] 0.05612958
## 
## [[2]]
## [1] 0.7544609
## 
## [[3]]
## [1] -0.1113131
## 
## [[4]]
## [1] 0.2137145
means2 = colMeans(mat)   # an alternative way to compute column means
means2
## [1]  0.05612958  0.75446088 -0.11131305  0.21371450

Usage of ellipsis in R functions

When defining an R function, you may use ellipsis (i.e., …) to pass any argument to the function. This is especially useful when the function calls another function with varying number of arguments. For example,

## a function without using ellipsis
func_main <- function(a, b, c){
  return(a + b + c)
}

func_main(a = 1, b = 2.4, c = 0.6)
## [1] 4
## a function where an argument of the function is the name of another function, func_2nd
## ellipsis ... is used to pass arguments for func_2nd
func_main2 <- function(a, func_2nd, ...){
  return(a + func_2nd(...))
}

## Usage case 1
func1 <- function(d, e){
  return(d + e)
}
func_main2(a = 1, func_2nd = func1, d = 2.4, e = 0.6)
## [1] 4
## Usage case 2
func2 <- function(f, g, h, i){
  return(f+g+h*i)
}
func_main2(a = 1, func_2nd = func2, f = 10, g = 20, h = 1, i = -3)
## [1] 28
## Wrong usages
#func_main2(a = 1, func_2nd = func2, f = 10, g = 20, h = 1) 
#func_main2(a = 1, func_2nd = func2, d = 2.4, e = 0.6) 

Make sure that all arguments passed through ellipsis are the same whenever ellipsis is used in the body of the function. Thus, in the usage case 2 above, the code doesn’t work when only f, g, h are passed to func2. It doesn’t work either when, for example, d and e are passed to func2.