To work with small and, above all, big data, it is essential to become familiar with the main techniques for speeding up your analyses, so you can reduce computation time and get insights as quickly as possible. This page is based on the Writing Efficient R Code DataCamp course by Colin Gillespie.

First, download the following datasets: movies.csv and movies.rds.

The Art of Benchmarking

# Print the R version details using version
version
##                _                           
## platform       x86_64-apple-darwin17.0     
## arch           x86_64                      
## os             darwin17.0                  
## system         x86_64, darwin17.0          
## status                                     
## major          4                           
## minor          1.1                         
## year           2021                        
## month          08                          
## day            10                          
## svn rev        80725                       
## language       R                           
## version.string R version 4.1.1 (2021-08-10)
## nickname       Kick Things
# How long does it take to read movies from CSV?
system.time(csv <- read.csv("movies.csv"))
##      user    system   elapsed 
##     0.217     0.015     0.235
# How long does it take to read movies from RDS?
system.time(rds <- readRDS("movies.rds"))
##      user    system   elapsed 
##     0.047     0.002     0.050
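readRDS() is faster because an RDS file stores the object in R's serialized binary (and compressed) format, while read.csv() has to parse plain text. As a sketch, an RDS copy of the data could be created from the CSV read above (file name kept the same as above):

# Serialize the data frame to a binary RDS file for faster reloading
saveRDS(csv, "movies.rds")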
# Load the package
library(microbenchmark)

# Compare the two read functions with sub-millisecond (nanosecond) precision
compare <- microbenchmark(read.csv("movies.csv"), 
                          readRDS("movies.rds"), 
                          times = 10)

# Print compare
compare
## Unit: milliseconds
##                    expr       min        lq     mean    median        uq
##  read.csv("movies.csv") 218.32178 220.60399 228.1838 224.57968 228.85688
##   readRDS("movies.rds")  46.26977  49.29614  52.5652  51.61593  57.66378
##        max neval
##  248.69548    10
##   59.83807    10
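The printed table reports absolute timings; for a quick visual comparison of the two distributions, the microbenchmark package also provides a boxplot method (using the compare object created above):

# Box plots of the timing distributions for each expression
boxplot(compare)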

Fine Tuning: Efficient Base R

# Dynamic or pre-allocated vectors?

growingSlow <- function(n) {
    x <- NULL
    for(i in 1:n) x <- c(x, rnorm(1)) # Grow by copying the whole vector each iteration
    x
}

growingFast <- function(n) {
    x <- numeric(n) # Pre-allocate
    for(i in 1:n) x[i] <- rnorm(1)
    x
}

n <- 1000
microbenchmark(growingFast(n), growingSlow(n), times = 10)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq       max neval
##  growingFast(n) 1.296160 1.327504 2.112312 1.790882 2.146779  5.345045    10
##  growingSlow(n) 2.544553 4.039229 6.052685 4.607155 6.440413 15.015740    10
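Pre-allocating avoids repeated copying, but the loop itself can also be removed: rnorm() is vectorized, so all n values can be generated in one call. A sketch with a hypothetical growingFastest() for comparison:

growingFastest <- function(n) {
    rnorm(n) # Fully vectorized: one call generates all n values
}

microbenchmark(growingFastest(n), growingFast(n), times = 10)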
# For loops or vectorized operations?

powerSlow <- function(n) {
  x <- rnorm(n)
  x2 <- numeric(length(x))
  for(i in 1:n) x2[i] <- x[i] * x[i]
  x2
}

powerFast <- function(n) {
  x <- rnorm(n)
  x * x
}

n <- 1000
microbenchmark(powerFast(n), powerSlow(n), times = 10)
## Unit: microseconds
##          expr    min     lq     mean  median     uq      max neval
##  powerFast(n) 42.499 43.499 155.0662 44.0445 46.347 1150.343    10
##  powerSlow(n) 89.432 90.677 429.1356 90.9920 92.296 3471.203    10
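The squaring could equally be written as x^2; like x * x, it is a single vectorized operation rather than one R-level multiplication per element. A sketch of the alternative:

powerFast2 <- function(n) {
  x <- rnorm(n)
  x^2 # Vectorized exponentiation
}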
sumLogSlow <- function(n) {
  total <- 0
  x <- runif(n, 1, 100)
  for(i in 1:n) total <- total + log(x[i])
  total
}

sumLogFast <- function(n) {
  x <- runif(n, 1, 100)
  sum(log(x))
}

n <- 1000
microbenchmark(sumLogFast(n), sumLogSlow(n), times = 10)
## Unit: microseconds
##           expr    min     lq     mean  median     uq      max neval
##  sumLogFast(n) 20.483 20.997 156.1198 21.3800 21.595 1362.390    10
##  sumLogSlow(n) 50.038 50.326 353.6485 50.7835 51.699 3073.641    10
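The speed-up comes from calling the vectorized log() and sum() once on the whole vector, not from avoiding the for keyword: an sapply()-based version still makes one R-level call per element. A sketch to check this, using a hypothetical sumLogApply():

sumLogApply <- function(n) {
  x <- runif(n, 1, 100)
  sum(sapply(x, log)) # Still one R-level log() call per element
}

microbenchmark(sumLogFast(n), sumLogApply(n), times = 10)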
# Data frames or matrices?

# A data frame
df <- data.frame(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)

# A matrix
mat <- matrix(runif(5e3), ncol = 5)

# Get a column
microbenchmark(mat[, 3], df[, 3])
## Unit: microseconds
##      expr   min     lq    mean median     uq    max neval
##  mat[, 3] 2.261 2.5010 2.77460 2.6555 2.8625  8.175   100
##   df[, 3] 5.242 5.5295 6.76378 5.8495 6.3690 52.303   100
# Get a row
microbenchmark(mat[1, ], df[1, ])
## Unit: nanoseconds
##      expr   min      lq     mean  median      uq    max neval
##  mat[1, ]   384   442.5   688.33   575.5   630.5  13064   100
##   df[1, ] 41157 42146.5 44754.87 42683.5 43431.0 211385   100
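Row access on a data frame is much slower because a row mixes column types and has to be assembled into a new one-row data frame. When only numeric columns are needed repeatedly, converting them to a matrix once can pay off; a sketch using the df defined above:

# One-off conversion of the numeric columns c and d to a matrix
num <- as.matrix(df[, c("c", "d")])
microbenchmark(df[1, c("c", "d")], num[1, ])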

Diagnosing Problems: Code Profiling

# Load the profvis package
library(profvis)

movies <- readRDS("movies.rds")

# Profile the following code
profvis({
  # Load and select data
  movies <- movies[movies$Comedy == 1, ]

  # Plot data of interest
  plot(movies$year, movies$rating)

  # Loess (local polynomial) regression line
  model <- loess(rating ~ year, data = movies)
  
  # Add fitted line to the plot
  j <- order(movies$year)
  lines(movies$year[j], model$fitted[j], col = "red")
})
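profvis() returns an htmlwidget: in an interactive session it opens the flame-graph viewer, and the object can also be assigned and saved for later inspection. A sketch, assuming the htmlwidgets package is installed:

# Capture the profile instead of displaying it immediately
p <- profvis(plot(movies$year, movies$rating))

# Write the interactive profile to a standalone HTML file
htmlwidgets::saveWidget(p, "profile.html")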