Whether you work with small or, in particular, big data, it is essential to become familiar with the main techniques for speeding up your analysis, so that you can reduce computation time and get insights as quickly as possible. This page is based on the DataCamp course "Writing Efficient R Code" by Colin Gillespie.
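First, download the following datasets and place them in your working directory: `movies.csv` and `movies.rds` (both files are read by the code below).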
# Print the details of the running R installation (platform, architecture,
# version number and release date) via the built-in `version` object
version
## _
## platform x86_64-apple-darwin17.0
## arch x86_64
## os darwin17.0
## system x86_64, darwin17.0
## status
## major 4
## minor 1.1
## year 2021
## month 08
## day 10
## svn rev 80725
## language R
## version.string R version 4.1.1 (2021-08-10)
## nickname Kick Things
# How long does it take to read movies from CSV?
# system.time() evaluates the expression once and reports the user, system
# and elapsed times in seconds
system.time(csv <- read.csv("movies.csv"))
## user system elapsed
## 0.217 0.015 0.235
# How long does it take to read movies from RDS?
system.time(rds <- readRDS("movies.rds"))
## user system elapsed
## 0.047 0.002 0.050
# Load the package
library(microbenchmark)
# Evaluate each read expression 10 times and collect the timings;
# microbenchmark is designed for sub-millisecond timing accuracy, so it is
# better suited to comparing fast expressions than a single system.time() call
compare <- microbenchmark(read.csv("movies.csv"),
readRDS("movies.rds"),
times = 10)
# Print the per-expression timing summary (min, quartiles, mean, median, max)
compare
## Unit: milliseconds
## expr min lq mean median uq
## read.csv("movies.csv") 218.32178 220.60399 228.1838 224.57968 228.85688
## readRDS("movies.rds") 46.26977 49.29614 52.5652 51.61593 57.66378
## max neval
## 248.69548 10
## 59.83807 10
# Dynamic or pre-allocated vectors?
# Build a vector of n standard-normal draws by appending one value at a time.
# Deliberately inefficient: each c() call creates a new, longer vector, which
# is the cost of "growing" a vector that this example is meant to demonstrate.
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n (numeric(0) when n is 0).
growingSlow <- function(n) {
  x <- numeric(0)
  # seq_len(n) is empty for n = 0, whereas 1:n would iterate over c(1, 0)
  # and append two unwanted values.
  for (i in seq_len(n)) x <- c(x, rnorm(1))
  x
}
# Build a vector of n standard-normal draws by filling a pre-allocated vector.
# The result is allocated once at its final length, so the loop only assigns
# into existing slots instead of growing the vector.
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n (numeric(0) when n is 0).
growingFast <- function(n) {
  x <- numeric(n) # Pre-allocate
  # seq_len(n) is empty for n = 0, whereas 1:n would iterate over c(1, 0)
  # and write to x[1], extending the empty vector.
  for (i in seq_len(n)) x[i] <- rnorm(1)
  x
}
# Benchmark both implementations for vectors of length 1000, 10 runs each
n <- 1000
microbenchmark(growingFast(n), growingSlow(n), times = 10)
## Unit: milliseconds
## expr min lq mean median uq max neval
## growingFast(n) 1.296160 1.327504 2.112312 1.790882 2.146779 5.345045 10
## growingSlow(n) 2.544553 4.039229 6.052685 4.607155 6.440413 15.015740 10
# For loops or vectorized operations?
# Square n standard-normal draws one element at a time with an explicit loop.
# Deliberately the slow variant: it serves as the baseline for the vectorized
# powerFast().
#
# n: number of values to generate (a non-negative whole number).
# Returns a numeric vector of length n holding the squared draws.
powerSlow <- function(n) {
  x <- rnorm(n)
  x2 <- numeric(length(x))
  # seq_along(x) is empty when x is empty, whereas 1:n with n = 0 would
  # iterate over c(1, 0) and store an NA in x2[1].
  for (i in seq_along(x)) x2[i] <- x[i] * x[i]
  x2
}
# Square n standard-normal draws with a single vectorized multiplication.
#
# n: number of values to generate.
# Returns a numeric vector of length n holding the squared draws.
powerFast <- function(n) {
  draws <- rnorm(n)
  draws * draws
}
# Benchmark the vectorized and the looped squaring on 1000 values, 10 runs each
n <- 1000
microbenchmark(powerFast(n), powerSlow(n), times = 10)
## Unit: microseconds
## expr min lq mean median uq max neval
## powerFast(n) 42.499 43.499 155.0662 44.0445 46.347 1150.343 10
## powerSlow(n) 89.432 90.677 429.1356 90.9920 92.296 3471.203 10
# Sum the natural logs of n uniform draws on [1, 100] with an explicit loop.
# Deliberately the slow variant: it serves as the baseline for the vectorized
# sumLogFast().
#
# n: number of values to generate (a non-negative whole number).
# Returns a single numeric value; 0 when n is 0, matching sum() on empty input.
sumLogSlow <- function(n) {
  total <- 0
  x <- runif(n, 1, 100)
  # seq_along(x) is empty when x is empty, whereas 1:n with n = 0 would
  # iterate over c(1, 0) and turn total into a zero-length vector.
  for (i in seq_along(x)) total <- total + log(x[i])
  total
}
# Sum the natural logs of n uniform draws on [1, 100] using vectorized
# log() and sum() instead of an element-wise loop.
#
# n: number of values to generate.
# Returns a single numeric value.
sumLogFast <- function(n) {
  log_values <- log(runif(n, 1, 100))
  sum(log_values)
}
# Benchmark the vectorized and the looped log-sum on 1000 values, 10 runs each
n <- 1000
microbenchmark(sumLogFast(n), sumLogSlow(n), times = 10)
## Unit: microseconds
## expr min lq mean median uq max neval
## sumLogFast(n) 20.483 20.997 156.1198 21.3800 21.595 1362.390 10
## sumLogSlow(n) 50.038 50.326 353.6485 50.7835 51.699 3073.641 10
# Data frames or matrices?
# A data frame: 1000 rows, five columns of different types
df <- data.frame(
  # Date-times: the current time plus a random offset of up to 86400 seconds
  a = lubridate::now() + runif(1e3) * 86400,
  # Dates: today's date plus a random offset of up to 30 days
  b = lubridate::today() + runif(1e3) * 30,
  # Integers 1 to 1000
  c = seq_len(1e3),
  # Uniform random numbers
  d = runif(1e3),
  # Random lowercase letters
  e = sample(letters, 1e3, replace = TRUE)
)
# A matrix: the same shape (1000 x 5), but a single numeric type throughout
mat <- matrix(runif(5e3), nrow = 1e3, ncol = 5)
# Get a column
# Time extracting the third column from the matrix and from the data frame
# (no `times` argument given, so each expression is evaluated 100 times)
microbenchmark(mat[, 3], df[, 3])
## Unit: microseconds
## expr min lq mean median uq max neval
## mat[, 3] 2.261 2.5010 2.77460 2.6555 2.8625 8.175 100
## df[, 3] 5.242 5.5295 6.76378 5.8495 6.3690 52.303 100
# Get a row
# Time extracting the first row from the matrix and from the data frame
# (no `times` argument given, so each expression is evaluated 100 times)
microbenchmark(mat[1, ], df[1, ])
## Unit: nanoseconds
## expr min lq mean median uq max neval
## mat[1, ] 384 442.5 688.33 575.5 630.5 13064 100
## df[1, ] 41157 42146.5 44754.87 42683.5 43431.0 211385 100
# Load the profvis package
library(profvis)
# Read the movies data from the RDS file
movies = readRDS("movies.rds")
# Profile the following code
profvis({
# Keep only the rows where Comedy equals 1
movies <- movies[movies$Comedy == 1, ]
# Plot data of interest
plot(movies$year, movies$rating)
# Loess (local polynomial) regression line
model <- loess(rating ~ year, data = movies)
# Add fitted line to the plot
# (order the points by year so the line is drawn from left to right)
j <- order(movies$year)
lines(movies$year[j], model$fitted[j], col = "red")
})