Query in SQL and Base R

Query (transformation) in base R is realized with the following functions:

subset to filter rows and select columns;
order to arrange rows;
transform to add variables;
aggregate to group rows;
merge to join data frames.

We will introduce them writing queries with respect to the dataset nycflights13. We will write the queries both in SQL and base R. Before querying in base R, let’s add a surrogate key called id to the data frame flights:

library(dplyr)
library(nycflights13)

# add id to flights
flights = mutate(flights, id = 1:nrow(flights)) %>% select(id, everything())

flights on Christmas

-- SQL
select *
from flights 
where month = 12 and day = 25

# base R
subset(flights, month == 12 & day == 25 )

flights that have a delay (either on departure or on arrival)

-- SQL
select *
from flights 
where dep_delay is not null or arr_delay is not null

# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))

flights that were not cancelled (that is, those with valid departure and arrival times)

-- SQL
select *
from flights 
where dep_time is not null and arr_time is not null

# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))

flights that have a departure delay sorted by delay

-- SQL
select *
from flights 
where dep_delay > 0
order by dep_delay desc

# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]

flights that catched up during the flight sorted by catch up time

-- SQL
select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup
from flights
where catchup > 0
order by catchup desc

# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df[order(desc(df$catchup)), ]

The number of flights per day

-- SQL
select month, day, count(*) as count
from flights
group by month, day

# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
df

The busy days (with more than 1000 flights)

-- SQL
select month, day, count(*) as count
from flights
group by month, day
having count > 1000;

# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
subset(df, count > 1000)

The number of flights per destination

-- SQL
select dest, count(*) as count
from flights
group by dest;

# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df

The popular destinations (with more than 365 flights) sorted by number of flights in descending order

-- SQL
select dest, count(*) as count
from flights
group by dest
having count > 365
order by count desc;

# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df[order(desc(df$count)), ]

the mean departure delay per day sorted in decreasing order of all flights on busy days of summer

-- SQL
select month, day, count(*) as count, avg(dep_delay) as avg_delay
from flights
where month = 6 or month = 7 or month = 8
group by month, day
having count > 1000
order by avg_delay desc;

# base R (stack overflow)
df = aggregate(dep_delay ~ day + month, flights, 
               FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)), 
               na.action = NULL)

# df$dep_delay is a matrix
str(df)

# unnest
df = cbind(df[,1:2], df[,3])
colnames(df)[3:4] = c("count", "avg_delay")

# filter
df = subset(df, count > 1000)

# order
df[order(desc(df$avg_delay)), ]

Flights that flew with a plane manufactured by BOEING

-- SQL
select flights.id, flights.tailnum, planes.manufacturer
from flights, planes
where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"

# base R
# all.x = TRUE --> left join
# all.y = TRUE --> right join
# all = TRUE   --> full join
# all = FALSE  --> inner join
df = merge(flights, planes, by="tailnum", all.x = TRUE)
subset(df, subset = (manufacturer == "BOEING"), select = c("id", "tailnum", "manufacturer"))

Flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude

-- SQL
select flights.id, flights.dest, airports.name, airports.alt
from flights, airports
where flights.dest = airports.faa and airports.alt > 6000
order by airports.alt;

# base R
df = merge(flights, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, subset = (alt > 6000), select = c("id", "dest", "name", "alt"))
df[order(df$alt), ]

Flights that took off with a plane with 4 engines and a visibility lower than 3 miles

-- SQL
select flights.id, planes.engines, weather.visib
from flights, weather, planes
where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and 
      flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4

# base R
df = merge(flights, weather, all.x = TRUE)
df = merge(df, planes, by = "tailnum", all.x = TRUE)
subset(df, subset = (visib < 3 & engines == 4), select = c("id", "engines", "visib"))

Flights with destination and origin airports with an altitude difference of more than 6000 feet

-- SQL
select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta
from flights, airports as airports1, airports as airports2
where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000

# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa", all.x = TRUE)
df = merge(df, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
subset(df, altdelta > 6000)