Write in SQL, Base R and dplyr the following queries. Run the queries with respect to the dataset nycflights13. Before querying, add an surrogate key called id to the data frame flights. Transformation in base R is realized with the following functions:
These are the queries:
library(dplyr)
library(nycflights13)
# add id to flights
flights = mutate(flights, id = row_number()) %>% select(id, everything())
-- flights on Christmas select * from flights where month = 12 and day = 25
# base R
subset(flights, month == 12 & day == 25 )
# dplyr
filter(flights, month == 12 & day == 25)
-- flights that have a valid (not null) delay either on departure or on arrival select * from flights where dep_delay is not null or arr_delay is not null
# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))
# dplyr
filter(flights, !is.na(dep_delay) | !is.na(arr_delay))
-- flights that have a valid (not null) delay both on departure and on arrival select * from flights where dep_time is not null and arr_time is not null
# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))
# dplyr
filter(flights, !is.na(dep_time) & !is.na(arr_time))
-- flights that have a positive (greater than 0) departure delay sorted by delay select * from flights where dep_delay > 0 order by dep_delay desc
# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]
# dplyr
filter(flights, dep_delay > 0) %>%
arrange(desc(dep_delay))
-- flights that catched up during the flight sorted by catch up time select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup from flights where catchup > 0 order by catchup desc
# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df = df[order(desc(df$catchup)), ]
head(df, 10)
# dplyr
mutate(flights, catchup = dep_delay - arr_delay) %>%
select(id, dep_delay, arr_delay, catchup) %>%
filter(catchup > 0) %>%
arrange(desc(catchup))
-- the number of flights per day select month, day, count(*) as count from flights group by month, day
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
head(df, 10)
# dplyr
group_by(flights, month, day) %>%
summarise(count = n())
# or
count(flights, month, day)
-- the busy days (with more than 1000 flights) select month, day, count(*) as count from flights group by month, day having count > 1000;
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
df = subset(df, count > 1000)
head(df, 10)
# dplyr
count(flights, month, day) %>%
filter(n > 1000)
-- the number of flights per destination select dest, count(*) as count from flights group by dest;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
head(df, 10)
# dplyr
count(flights, dest)
-- the popular destinations (with more than 365 flights) sorted by number of flights in descending order select dest, count(*) as count from flights group by dest having count > 365 order by count desc;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df = df[order(desc(df$count)), ]
head(df, 10)
# dplyr
count(flights, dest) %>%
filter(n > 365) %>%
arrange(desc(n))
-- the mean departure delay per day sorted in decreasing order of all flights on busy days of July select month, day, count(*) as count, round(avg(dep_delay), 2) as avg_delay from flights where month = 7 group by month, day having count > 1000 order by avg_delay desc;
# base R (stack overflow).
# na.action = NULL forces the default na.action = na.omit, hence we do not ignore the missing values. This allows length() to count
# these observations. The mean() function ignores them because of na.rm = TRUE
df = subset(flights, month == 7)
df = aggregate(dep_delay ~ day + month, df,
FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)),
na.action = NULL)
# df$dep_delay is a matrix
str(df)
# unnest
df2 = cbind(df[,1:2], df[,3])
colnames(df2)[3:4] = c("count", "avg_delay")
# filter
df2 = subset(df2, count > 1000)
# order
df2 = df2[order(desc(df2$avg_delay)), ]
head(df2)
# dplyr
filter(flights, month == 7) %>%
group_by(month, day) %>%
summarise(n = n(), avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
filter(n > 1000) %>%
arrange(desc(avg_delay))
-- flights that flew with a plane manufactured by BOEING select flights.id, flights.tailnum, planes.manufacturer from flights, planes where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"
# base R (inner join)
df = merge(flights, subset(planes, manufacturer == "BOEING"), by="tailnum")
df = subset(df, select = c("id", "tailnum", "manufacturer"))
head(df, 10)
# dplyr
inner_join(flights, planes, by="tailnum") %>%
select(id, tailnum, manufacturer) %>%
filter(manufacturer == "BOEING")
# dplyr (more efficient)
inner_join(flights, filter(planes, manufacturer == "BOEING"), by="tailnum") %>%
select(id, tailnum, manufacturer)
-- flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude select flights.id, flights.dest, airports.name, airports.alt from flights, airports where flights.dest = airports.faa and airports.alt > 6000 order by airports.alt;
# base R
df = merge(flights, subset(airports, alt > 6000), by.x = "dest", by.y = "faa")
df = subset(df, select = c("id", "dest", "name", "alt"))
df = df[order(df$alt), ]
head(df, 10)
# dplyr
inner_join(flights, filter(airports, alt > 6000), by=c("dest" = "faa")) %>%
select(id, dest, name, alt) %>%
arrange(alt)
-- flights that took off with a plane with 4 engines and a visibility lower than 3 miles select flights.id, planes.engines, weather.visib from flights, weather, planes where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4
# base R
df = subset(flights, select = -time_hour)
df = merge(df, subset(weather, visib < 3))
df = merge(df, subset(planes, engines == 4), by = "tailnum")
df = subset(df, select = c("id", "engines", "visib"))
head(df, 10)
# dplyr
select(flights, -time_hour) %>%
inner_join(filter(weather, visib < 3)) %>%
inner_join(filter(planes, engines == 4), by = "tailnum") %>%
select(id, engines, visib)
-- flights with destination and origin airports with an altitude difference of more than 6000 feet select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta from flights, airports as airports1, airports as airports2 where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000
# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa")
df = merge(df, airports, by.x = "dest", by.y = "faa")
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
df = subset(df, altdelta > 6000)
head(df, 10)
# dplyr
inner_join(flights, airports, by = c("origin" = "faa")) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
select(id, alt.x, alt.y) %>%
mutate(altdelta = alt.y - alt.x) %>%
filter(altdelta > 6000) # cannot anticipate filter since depends on joins
# Alternative: simulate non equi-join in dplyr using dummy variables (stack overflow)
inner_join(mutate(airports, dummy = TRUE), mutate(airports, dummy = TRUE), by = "dummy") %>%
mutate(altdelta = alt.y - alt.x) %>%
filter(altdelta > 6000) %>%
inner_join(flights, by = c("faa.x" = "origin", "faa.y" = "dest")) %>%
select(id, alt.x, alt.y, altdelta)