Query (transformation) in base R is realized with the following functions:

We will introduce them writing queries with respect to the dataset nycflights13. We will write the queries both in SQL and base R. Before querying in base R, letโ€™s add a surrogate key called id to the data frame flights:

library(dplyr)
library(nycflights13)

# add id to flights
flights = mutate(flights, id = 1:nrow(flights)) %>% select(id, everything())

flights on Christmas

-- SQL
select *
from flights 
where month = 12 and day = 25
# base R
subset(flights, month == 12 & day == 25 )

flights that have a delay (either on departure or on arrival)

-- SQL
select *
from flights 
where dep_delay is not null or arr_delay is not null
# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))

flights that were not cancelled (that is, those with valid departure and arrival times)

-- SQL
select *
from flights 
where dep_time is not null and arr_time is not null
# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))

flights that have a departure delay sorted by delay

-- SQL
select *
from flights 
where dep_delay > 0
order by dep_delay desc

# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]

flights that catched up during the flight sorted by catch up time

-- SQL
select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup
from flights
where catchup > 0
order by catchup desc

# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df[order(desc(df$catchup)), ]

The number of flights per day

-- SQL
select month, day, count(*) as count
from flights
group by month, day
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
df

The busy days (with more than 1000 flights)

-- SQL
select month, day, count(*) as count
from flights
group by month, day
having count > 1000;
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
subset(df, count > 1000)

The number of flights per destination

-- SQL
select dest, count(*) as count
from flights
group by dest;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df

The popular destinations (with more than 365 flights) sorted by number of flights in descending order

-- SQL
select dest, count(*) as count
from flights
group by dest
having count > 365
order by count desc;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df[order(desc(df$count)), ]

the mean departure delay per day sorted in decreasing order of all flights on busy days of summer

-- SQL
select month, day, count(*) as count, avg(dep_delay) as avg_delay
from flights
where month = 6 or month = 7 or month = 8
group by month, day
having count > 1000
order by avg_delay desc;
# base R (stack overflow)
df = aggregate(dep_delay ~ day + month, flights, 
               FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)), 
               na.action = NULL)

# df$dep_delay is a matrix
str(df)

# unnest
df = cbind(df[,1:2], df[,3])
colnames(df)[3:4] = c("count", "avg_delay")

# filter
df = subset(df, count > 1000)

# order
df[order(desc(df$avg_delay)), ]

Flights that flew with a plane manufactured by BOEING

-- SQL
select flights.id, flights.tailnum, planes.manufacturer
from flights, planes
where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"

# base R
# all.x = TRUE --> left join
# all.y = TRUE --> right join
# all = TRUE   --> full join
# all = FALSE  --> inner join
df = merge(flights, planes, by="tailnum", all.x = TRUE)
subset(df, subset = (manufacturer == "BOEING"), select = c("id", "tailnum", "manufacturer"))

Flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude

-- SQL
select flights.id, flights.dest, airports.name, airports.alt
from flights, airports
where flights.dest = airports.faa and airports.alt > 6000
order by airports.alt;
# base R
df = merge(flights, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, subset = (alt > 6000), select = c("id", "dest", "name", "alt"))
df[order(df$alt), ]

Flights that took off with a plane with 4 engines and a visibility lower than 3 miles

-- SQL
select flights.id, planes.engines, weather.visib
from flights, weather, planes
where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and 
      flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4
# base R
df = merge(flights, weather, all.x = TRUE)
df = merge(df, planes, by = "tailnum", all.x = TRUE)
subset(df, subset = (visib < 3 & engines == 4), select = c("id", "engines", "visib"))

Flights with destination and origin airports with an altitude difference of more than 6000 feet

-- SQL
select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta
from flights, airports as airports1, airports as airports2
where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000

# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa", all.x = TRUE)
df = merge(df, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
subset(df, altdelta > 6000)