Write in SQL, Base R and dplyr the following queries. Run the queries with respect to the dataset nycflights13. Before querying, add an surrogate key called id to the data frame flights. Transformation in base R is realized with the following functions:

These are the queries:

  1. flights on Christmas
  2. flights that have a valid (not null) delay either on departure or on arrival
  3. flights that have a valid (not null) delay both on departure and on arrival
  4. flights that have a positive (greater than 0) departure delay sorted by delay
  5. flights that catched up during the flight sorted by catch up time
  6. the number of flights per day
  7. the busy days (those with more than 1000 flights)
  8. the number of flights per destination
  9. the popular destinations (those with more than 365 flights) sorted by number of flights in descending order
  10. the mean departure delay per day sorted in decreasing order of all flights on busy days of July
  11. flights that flew with a plane manufactured by BOEING
  12. flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude
  13. flights that took off with a plane with 4 engines and a visibility lower than 3 miles
  14. flights with destination and origin airports with an altitude difference of more than 6000 feet
library(dplyr)
library(nycflights13)
# add id to flights
flights = mutate(flights, id = row_number()) %>% select(id, everything())
-- flights on Christmas
select *
from flights 
where month = 12 and day = 25
# base R
subset(flights, month == 12 & day == 25 )
# dplyr
filter(flights, month == 12 & day == 25)

-- flights that have a valid (not null) delay either on departure or on arrival
select *
from flights 
where dep_delay is not null or arr_delay is not null
# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))
# dplyr
filter(flights, !is.na(dep_delay) | !is.na(arr_delay))

-- flights that have a valid (not null) delay both on departure and on arrival
select *
from flights 
where dep_time is not null and arr_time is not null
# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))
# dplyr
filter(flights, !is.na(dep_time) & !is.na(arr_time))

-- flights that have a positive (greater than 0) departure delay sorted by delay
select *
from flights 
where dep_delay > 0
order by dep_delay desc

# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]
# dplyr
filter(flights, dep_delay > 0) %>%
  arrange(desc(dep_delay))

-- flights that catched up during the flight sorted by catch up time
select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup
from flights
where catchup > 0
order by catchup desc

# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df = df[order(desc(df$catchup)), ]
head(df, 10)
# dplyr
mutate(flights, catchup = dep_delay - arr_delay) %>%
  select(id, dep_delay, arr_delay, catchup) %>%
  filter(catchup > 0) %>%
  arrange(desc(catchup)) 

-- the number of flights per day
select month, day, count(*) as count
from flights
group by month, day
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
head(df, 10)
# dplyr
group_by(flights, month, day) %>%
  summarise(count = n())
# or
count(flights, month, day)

-- the busy days (with more than 1000 flights)
select month, day, count(*) as count
from flights
group by month, day
having count > 1000;
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
df = subset(df, count > 1000)
head(df, 10)
# dplyr
count(flights, month, day) %>%
  filter(n > 1000)

-- the number of flights per destination
select dest, count(*) as count
from flights
group by dest;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
head(df, 10)
# dplyr
count(flights, dest)

-- the popular destinations (with more than 365 flights) sorted by number of flights in descending order
select dest, count(*) as count
from flights
group by dest
having count > 365
order by count desc;
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df = df[order(desc(df$count)), ]
head(df, 10)
# dplyr
count(flights, dest) %>%
  filter(n > 365) %>%
  arrange(desc(n))

-- the mean departure delay per day sorted in decreasing order of all flights on busy days of July 
select month, day, count(*) as count, round(avg(dep_delay), 2) as avg_delay
from flights
where month = 7
group by month, day
having count > 1000
order by avg_delay desc;
# base R (stack overflow). 
# na.action = NULL forces the default na.action = na.omit, hence we do not ignore the missing values. This allows length() to count 
# these observations. The mean() function ignores them because of na.rm = TRUE

df = subset(flights, month == 7)  

df = aggregate(dep_delay ~ day + month, df, 
               FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)), 
               na.action = NULL)


# df$dep_delay is a matrix
str(df)

# unnest
df2 = cbind(df[,1:2], df[,3])
colnames(df2)[3:4] = c("count", "avg_delay")

# filter
df2 = subset(df2, count > 1000)

# order
df2 = df2[order(desc(df2$avg_delay)), ]

head(df2)
# dplyr
filter(flights, month == 7) %>%
  group_by(month, day) %>%
  summarise(n = n(), avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  filter(n > 1000) %>%
  arrange(desc(avg_delay))
-- flights that flew with a plane manufactured by BOEING
select flights.id, flights.tailnum, planes.manufacturer
from flights, planes
where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"

# base R (inner join)
df = merge(flights, subset(planes, manufacturer == "BOEING"), by="tailnum")
df = subset(df, select = c("id", "tailnum", "manufacturer"))
head(df, 10)
# dplyr
inner_join(flights, planes, by="tailnum") %>%
  select(id, tailnum, manufacturer) %>%
  filter(manufacturer == "BOEING") 

# dplyr (more efficient)
inner_join(flights, filter(planes, manufacturer == "BOEING"),  by="tailnum") %>%
  select(id, tailnum, manufacturer)

-- flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude
select flights.id, flights.dest, airports.name, airports.alt
from flights, airports
where flights.dest = airports.faa and airports.alt > 6000
order by airports.alt;
# base R
df = merge(flights, subset(airports, alt > 6000), by.x = "dest", by.y = "faa")
df = subset(df, select = c("id", "dest", "name", "alt"))
df = df[order(df$alt), ]
head(df, 10)
# dplyr
inner_join(flights, filter(airports, alt > 6000), by=c("dest" = "faa")) %>%
  select(id, dest, name, alt) %>%
  arrange(alt) 
-- flights that took off with a plane with 4 engines and a visibility lower than 3 miles
select flights.id, planes.engines, weather.visib
from flights, weather, planes
where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4

# base R
df = subset(flights, select = -time_hour)
df = merge(df, subset(weather, visib < 3))
df = merge(df, subset(planes, engines == 4), by = "tailnum")
df = subset(df, select = c("id", "engines", "visib"))
head(df, 10)
# dplyr
select(flights, -time_hour) %>%
  inner_join(filter(weather, visib < 3)) %>%
  inner_join(filter(planes, engines == 4), by = "tailnum") %>%
  select(id, engines, visib) 

-- flights with destination and origin airports with an altitude difference of more than 6000 feet
select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta
from flights, airports as airports1, airports as airports2
where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000

# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa")
df = merge(df, airports, by.x = "dest", by.y = "faa")
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
df = subset(df, altdelta > 6000)
head(df, 10)
# dplyr 
inner_join(flights, airports, by = c("origin" = "faa")) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  select(id, alt.x, alt.y) %>%
  mutate(altdelta = alt.y - alt.x) %>%
  filter(altdelta > 6000) # cannot anticipate filter since depends on joins

# Alternative: simulate non equi-join in dplyr using dummy variables (stack overflow)
inner_join(mutate(airports, dummy = TRUE), mutate(airports, dummy = TRUE), by = "dummy") %>%
  mutate(altdelta = alt.y - alt.x) %>%
  filter(altdelta > 6000) %>%
  inner_join(flights, by = c("faa.x" = "origin", "faa.y" = "dest"))  %>%
  select(id, alt.x, alt.y, altdelta)