The beauty of R is that it is built for data analysis. The downside is that R code can be slow, and slow code gets in the way of that analysis. It is therefore worth knowing the main techniques for speeding up your code, so you can cut computation time and get to your insights faster.
This page is based on the Writing Efficient R Code DataCamp course by Colin Gillespie.
Datasets: movies.csv, movies.rds
# Print the R version details using version
version
##                _
## platform       x86_64-apple-darwin15.6.0
## arch           x86_64
## os             darwin15.6.0
## system         x86_64, darwin15.6.0
## status
## major          3
## minor          5.1
## year           2018
## month          07
## day            02
## svn rev        74947
## language       R
## version.string R version 3.5.1 (2018-07-02)
## nickname       Feather Spray
# How long does it take to read movies from CSV?
system.time(csv <- read.csv("movies.csv"))
## user system elapsed
## 0.416 0.019 0.438
# How long does it take to read movies from RDS?
system.time(rds <- readRDS("movies.rds"))
## user system elapsed
## 0.052 0.002 0.055
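The RDS file is simply a binary, serialized copy of the same data. If you only have the CSV, a minimal sketch of how you could create movies.rds yourself with saveRDS(), assuming the CSV has already been read into csv as above:
# Cache the data frame in R's binary RDS format so later sessions can use the faster readRDS()
saveRDS(csv, file = "movies.rds")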
# Load the package
library(microbenchmark)
# Compare the two functions
compare <- microbenchmark(read.csv("movies.csv"),
                          readRDS("movies.rds"),
                          times = 10)
# Print compare
compare
## Unit: milliseconds
##                    expr       min        lq      mean    median        uq       max neval
##  read.csv("movies.csv") 408.35552 412.06844 457.26387 431.41877 462.35406 666.12105    10
##   readRDS("movies.rds")  48.03375  48.61207  59.06533  51.20348  72.76131  83.55275    10
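The spread of the timings is often as interesting as the mean, so it can help to plot the microbenchmark object. A minimal sketch; the autoplot() method assumes ggplot2 is installed:
# Violin-style plot of the timing distributions (needs ggplot2)
library(ggplot2)
autoplot(compare)
# Base-graphics alternative that needs no extra packages
boxplot(compare)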
# Dynamic or pre-allocated vectors?
growingSlow <- function(n) {
  x <- NULL
  for (i in 1:n) x <- c(x, rnorm(1))
  x
}
growingFast <- function(n) {
  x <- numeric(n) # Pre-allocate
  for (i in 1:n) x[i] <- rnorm(1)
  x
}
n <- 1000
microbenchmark(growingFast(n), growingSlow(n), times = 10)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq       max neval
##  growingFast(n) 1.294030 1.926876 2.382776 2.309296 2.659062  4.331309    10
##  growingSlow(n) 2.622895 3.346198 6.487071 4.531758 5.344352 19.671067    10
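Pre-allocation helps, but in this particular example the loop can be dropped altogether because rnorm() is itself vectorized. A sketch of a third variant for comparison (the name growingVectorized is mine, not from the course):
growingVectorized <- function(n) {
  rnorm(n) # One vectorized call, no loop and no copying
}
microbenchmark(growingFast(n), growingSlow(n), growingVectorized(n), times = 10)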
# For loops or vectorized operations?
powerSlow <- function(n) {
  x <- rnorm(n)
  x2 <- numeric(length(x))
  for (i in 1:n) x2[i] <- x[i] * x[i]
  x2
}
powerFast <- function(n) {
  x <- rnorm(n)
  x * x
}
n <- 1000
microbenchmark(powerFast(n), powerSlow(n), times = 10)
## Unit: microseconds
## expr min lq mean median uq max neval
## powerFast(n) 57.834 58.240 173.0290 59.072 60.089 1189.470 10
## powerSlow(n) 128.467 129.456 401.1015 130.195 131.193 2831.493 10
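A related micro-optimization: for squaring, x * x is often at least as fast as x^2, because the power operator has to handle arbitrary exponents. A quick check, with the caveat that the difference varies across machines and R versions:
x <- rnorm(1e5)
microbenchmark(x * x, x^2, times = 100)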
sumLogSlow <- function(n) {
  total <- 0
  x <- runif(n, 1, 100)
  for (i in 1:n) total <- total + log(x[i])
  total
}
sumLogFast <- function(n) {
  x <- runif(n, 1, 100)
  sum(log(x))
}
n <- 1000
microbenchmark(sumLogFast(n), sumLogSlow(n), times = 10)
## Unit: microseconds
## expr min lq mean median uq max neval
## sumLogFast(n) 32.549 32.985 148.1281 33.2635 41.733 1165.449 10
## sumLogSlow(n) 84.370 84.848 301.1464 85.3225 86.419 2243.039 10
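The apply family is tidier than an explicit for loop, but it is still an R-level loop, so it should not be mistaken for true vectorization. A sketch of a middle-ground variant for comparison (sumLogApply is my name, not from the course):
sumLogApply <- function(n) {
  x <- runif(n, 1, 100)
  sum(vapply(x, log, numeric(1))) # Still loops over elements at the R level
}
microbenchmark(sumLogFast(n), sumLogSlow(n), sumLogApply(n), times = 10)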
# Data frames or matrices?
# A data frame
df <- data.frame(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
# A matrix
mat <- matrix(runif(6e3), ncol = 6)
# Get a column
microbenchmark(mat[, 3], df[, 3])
## Unit: microseconds
## expr min lq mean median uq max neval
## mat[, 3] 2.281 2.4525 2.58998 2.5375 2.634 6.976 100
## df[, 3] 4.595 4.8090 5.26357 5.0190 5.226 25.595 100
# Get a row
microbenchmark(mat[1, ], df[1, ])
## Unit: nanoseconds
## expr min lq mean median uq max neval
## mat[1, ] 390 439.5 651.17 647.5 725.0 4121 100
## df[1, ] 43353 44170.5 46151.43 44656.0 45109.5 146067 100
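How you index a data frame also matters: extracting a column by name with $ or [[ skips part of the work done by the data-frame [ method and is usually a little faster. A quick, hedged comparison using the columns defined above:
# Three ways of extracting a single column from the data frame
microbenchmark(df[, 3], df[["c"]], df$c)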
# Load the profvis package
library(profvis)
movies <- readRDS("movies.rds")
# Profile the following code
profvis({
  # Select the comedies (the data were loaded above with readRDS)
  movies <- movies[movies$Comedy == 1, ]
  # Plot data of interest
  plot(movies$year, movies$rating)
  # Loess (local polynomial) regression line
  model <- loess(rating ~ year, data = movies)
  # Add fitted line to the plot
  j <- order(movies$year)
  lines(movies$year[j], model$fitted[j], col = "red")
})
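profvis() opens an interactive flame graph; if you prefer a plain-text summary, the same steps can be profiled with base R's Rprof() and summaryRprof(). A sketch, with an arbitrary output file name:
# Base-R profiling of the same steps, summarized as time spent per call
movies <- readRDS("movies.rds")
Rprof("movies_profile.out")
movies <- movies[movies$Comedy == 1, ]
plot(movies$year, movies$rating)
model <- loess(rating ~ year, data = movies)
j <- order(movies$year)
lines(movies$year[j], model$fitted[j], col = "red")
Rprof(NULL) # Stop profiling
summaryRprof("movies_profile.out")$by.self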