Writing Efficient R Code

The beauty of R is that it is built for data analysis. The downside is that R can sometimes be slow, which gets in the way of that analysis. For this reason, it is essential to become familiar with the main techniques for speeding up your code, so you can reduce computation time and get insights as quickly as possible.

This page is based on the DataCamp course "Writing Efficient R Code" by Colin Gillespie.

Datasets:

The Art of Benchmarking

# Print the R version details using version.
# `version` is a built-in variable describing the running R build (platform,
# major/minor version, release date, nickname); knowing it matters because
# performance differs between R releases.
version

##                _                           
## platform       x86_64-apple-darwin15.6.0   
## arch           x86_64                      
## os             darwin15.6.0                
## system         x86_64, darwin15.6.0        
## status                                     
## major          3                           
## minor          5.1                         
## year           2018                        
## month          07                          
## day            02                          
## svn rev        74947                       
## language       R                           
## version.string R version 3.5.1 (2018-07-02)
## nickname       Feather Spray

# How long does it take to read movies from CSV?
# system.time() reports user, system and elapsed seconds for one evaluation of
# the expression; the assignment inside the call keeps the result in `csv`.
system.time(csv <- read.csv("movies.csv"))

##    user  system elapsed 
##   0.416   0.019   0.438

# How long does it take to read movies from RDS?
# Same measurement as for the CSV file, but reading the "movies.rds" file with
# readRDS(); the result is kept in `rds`.
system.time(rds <- readRDS("movies.rds"))

##    user  system elapsed 
##   0.052   0.002   0.055

# Load the benchmarking package
library(microbenchmark)

# Compare the two readers: each expression is evaluated 10 times, which gives
# a distribution of timings rather than the single measurement system.time()
# provides
compare <- microbenchmark(
  read.csv("movies.csv"),
  readRDS("movies.rds"),
  times = 10
)

# Print the timing summary
compare

## Unit: milliseconds
##                    expr       min        lq      mean    median        uq
##  read.csv("movies.csv") 408.35552 412.06844 457.26387 431.41877 462.35406
##   readRDS("movies.rds")  48.03375  48.61207  59.06533  51.20348  72.76131
##        max neval
##  666.12105    10
##   83.55275    10

Fine Tuning: Efficient Base R

# Dynamic or pre-allocated vectors?

# Build a vector of n standard-normal draws by appending one element at a
# time. This is deliberately the inefficient variant: the vector is regrown
# with c() on every iteration, and it serves as the slow baseline for
# growingFast().
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n.
growingSlow <- function(n) {
    x <- numeric(0)
    # seq_len(n) is empty when n is 0; 1:n would iterate over c(1, 0) and
    # return two values instead of none
    for (i in seq_len(n)) {
        x <- c(x, rnorm(1))
    }
    x
}

# Build a vector of n standard-normal draws by filling a pre-allocated
# vector. The result vector is created once at its final length, so no
# regrowing happens inside the loop.
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n.
growingFast <- function(n) {
    x <- numeric(n) # Pre-allocate
    # seq_len(n) is empty when n is 0; 1:n would iterate over c(1, 0) and
    # extend the empty vector to length 1
    for (i in seq_len(n)) {
        x[i] <- rnorm(1)
    }
    x
}

# Benchmark both implementations on the same problem size, 10 evaluations each
n <- 1000
microbenchmark(growingFast(n), growingSlow(n), times = 10)

## Unit: milliseconds
##            expr      min       lq     mean   median       uq       max
##  growingFast(n) 1.294030 1.926876 2.382776 2.309296 2.659062  4.331309
##  growingSlow(n) 2.622895 3.346198 6.487071 4.531758 5.344352 19.671067
##  neval
##     10
##     10

# For loops or vectorized operations?

# Square n standard-normal draws with an explicit element-by-element loop.
# This is deliberately the non-vectorized variant, used as the slow baseline
# for powerFast().
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n holding the squared draws.
powerSlow <- function(n) {
  x <- rnorm(n)
  x2 <- numeric(length(x))
  # seq_along(x) is empty for zero-length x; 1:n with n = 0 would visit
  # index 1 and write NA into the result
  for (i in seq_along(x)) {
    x2[i] <- x[i] * x[i]
  }
  x2
}

# Square n standard-normal draws in a single vectorized operation.
#
# n: number of values to generate.
# Returns a numeric vector of length n holding the squared draws.
powerFast <- function(n) {
  draws <- rnorm(n)
  draws^2
}

# Benchmark the vectorized and the looped squaring on the same problem size,
# 10 evaluations each
n <- 1000
microbenchmark(powerFast(n), powerSlow(n), times = 10)

## Unit: microseconds
##          expr     min      lq     mean  median      uq      max neval
##  powerFast(n)  57.834  58.240 173.0290  59.072  60.089 1189.470    10
##  powerSlow(n) 128.467 129.456 401.1015 130.195 131.193 2831.493    10

# Sum the natural logs of n uniform draws from [1, 100] with an explicit
# accumulator loop. This is deliberately the non-vectorized variant, used as
# the slow baseline for sumLogFast().
#
# n: number of values to generate (a non-negative whole number).
# Returns a single numeric value; 0 when n is 0.
sumLogSlow <- function(n) {
  total <- 0
  x <- runif(n, 1, 100)
  # seq_along(x) is empty for zero-length x; 1:n with n = 0 would add
  # log(x[1]), which is NA, to the total
  for (i in seq_along(x)) {
    total <- total + log(x[i])
  }
  total
}

# Sum the natural logs of n uniform draws from [1, 100] using vectorized
# log() and sum().
#
# n: number of values to generate.
# Returns a single numeric value.
sumLogFast <- function(n) {
  sum(log(runif(n, min = 1, max = 100)))
}

# Benchmark the vectorized and the looped log-sum on the same problem size,
# 10 evaluations each
n <- 1000
microbenchmark(sumLogFast(n), sumLogSlow(n), times = 10)

## Unit: microseconds
##           expr    min     lq     mean  median     uq      max neval
##  sumLogFast(n) 32.549 32.985 148.1281 33.2635 41.733 1165.449    10
##  sumLogSlow(n) 84.370 84.848 301.1464 85.3225 86.419 2243.039    10

# Data frames or matrices?

# A data frame with 1000 rows and five columns of different types, to
# contrast with the all-numeric matrix used in the same comparison
df <- data.frame(
  a = lubridate::now() + runif(1e3) * 86400,  # date-times up to 24 hours ahead
  b = lubridate::today() + runif(1e3) * 30,   # dates up to 30 days ahead
  c = 1:1e3,                                  # integer sequence
  d = runif(1e3),                             # uniform draws
  e = sample(letters, 1e3, replace = TRUE)    # random lower-case letters
)

# A matrix: 6000 uniform draws arranged in 6 columns (so 1000 rows); every
# element has the same type
mat <- matrix(runif(6e3), ncol = 6)

# Get a column: time extracting the third column from the matrix and from the
# data frame (no `times` argument, so microbenchmark's default count is used)
microbenchmark(mat[, 3], df[, 3])

## Unit: microseconds
##      expr   min     lq    mean median    uq    max neval
##  mat[, 3] 2.281 2.4525 2.58998 2.5375 2.634  6.976   100
##   df[, 3] 4.595 4.8090 5.26357 5.0190 5.226 25.595   100

# Get a row: time extracting the first row from the matrix and from the
# data frame (no `times` argument, so microbenchmark's default count is used)
microbenchmark(mat[1, ], df[1, ])

## Unit: nanoseconds
##      expr   min      lq     mean  median      uq    max neval
##  mat[1, ]   390   439.5   651.17   647.5   725.0   4121   100
##   df[1, ] 43353 44170.5 46151.43 44656.0 45109.5 146067   100

Diagnosing Problems: Code Profiling

# Load the profvis package
library(profvis)

# Read the movies data once, outside the profiled expression, so that file
# loading is not part of the profile
movies <- readRDS("movies.rds")

# Profile the following code
profvis({
  # Keep only the rows where the Comedy column equals 1
  movies <- movies[movies$Comedy == 1, ]

  # Plot data of interest
  plot(movies$year, movies$rating)

  # Loess (local polynomial) regression line
  model <- loess(rating ~ year, data = movies)

  # Add fitted line to the plot; lines() joins points in the order given,
  # so the fitted values are drawn in increasing year order
  j <- order(movies$year)
  lines(movies$year[j], model$fitted[j], col = "red")
})