Base R or dplyr?

Write the following queries in SQL and R (both base R and dplyr). For base R, use the following functions:

subset to filter rows and select columns;
order to arrange rows;
transform to add variables;
aggregate to group rows;
merge to join data frames.

Run the queries with respect to the dataset nycflights13. Before querying, add an surrogate key called id to the data frame flights.

flights on Christmas
flights that have a delay (either on departure or on arrival)
flights that were not cancelled (that is, those with valid departure and arrival times)
flights that have a departure delay sorted by delay
flights that catched up during the flight sorted by catch up time
the number of flights per day
the busy days (with more than 1000 flights)
the number of flights per destination
the popular destinations (with more than 365 flights) sorted by number of flights in descending order
the mean departure delay per day sorted in decreasing order of all flights on busy days of July
flights that flew with a plane manufactured by BOEING
flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude
flights that took off with a plane with 4 engines and a visibility lower than 3 miles
flights with destination and origin airports with an altitude difference of more than 6000 feet

library(dplyr)
library(nycflights13)

# add id to flights
flights = mutate(flights, id = row_number()) %>% select(id, everything())

-- flights on Christmas
select *
from flights 
where month = 12 and day = 25

# dplyr
filter(flights, month == 12 & day == 25)

## # A tibble: 719 x 20
##        id  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 105233  2013    12    25      456            500        -4      649
##  2 105234  2013    12    25      524            515         9      805
##  3 105235  2013    12    25      542            540         2      832
##  4 105236  2013    12    25      546            550        -4     1022
##  5 105237  2013    12    25      556            600        -4      730
##  6 105238  2013    12    25      557            600        -3      743
##  7 105239  2013    12    25      557            600        -3      818
##  8 105240  2013    12    25      559            600        -1      855
##  9 105241  2013    12    25      559            600        -1      849
## 10 105242  2013    12    25      600            600         0      850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# R base
subset(flights, month == 12 & day == 25 )

## # A tibble: 719 x 20
##        id  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 105233  2013    12    25      456            500        -4      649
##  2 105234  2013    12    25      524            515         9      805
##  3 105235  2013    12    25      542            540         2      832
##  4 105236  2013    12    25      546            550        -4     1022
##  5 105237  2013    12    25      556            600        -4      730
##  6 105238  2013    12    25      557            600        -3      743
##  7 105239  2013    12    25      557            600        -3      818
##  8 105240  2013    12    25      559            600        -1      855
##  9 105241  2013    12    25      559            600        -1      849
## 10 105242  2013    12    25      600            600         0      850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

-- flights that have a delay (either on departure or on arrival)
select *
from flights 
where dep_delay is not null or arr_delay is not null

# dplyr
filter(flights, !is.na(dep_delay) | !is.na(arr_delay))

## # A tibble: 328,521 x 20
##       id  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1     1  2013     1     1      517            515         2      830
##  2     2  2013     1     1      533            529         4      850
##  3     3  2013     1     1      542            540         2      923
##  4     4  2013     1     1      544            545        -1     1004
##  5     5  2013     1     1      554            600        -6      812
##  6     6  2013     1     1      554            558        -4      740
##  7     7  2013     1     1      555            600        -5      913
##  8     8  2013     1     1      557            600        -3      709
##  9     9  2013     1     1      557            600        -3      838
## 10    10  2013     1     1      558            600        -2      753
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))

## # A tibble: 328,521 x 20
##       id  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1     1  2013     1     1      517            515         2      830
##  2     2  2013     1     1      533            529         4      850
##  3     3  2013     1     1      542            540         2      923
##  4     4  2013     1     1      544            545        -1     1004
##  5     5  2013     1     1      554            600        -6      812
##  6     6  2013     1     1      554            558        -4      740
##  7     7  2013     1     1      555            600        -5      913
##  8     8  2013     1     1      557            600        -3      709
##  9     9  2013     1     1      557            600        -3      838
## 10    10  2013     1     1      558            600        -2      753
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

-- flights that were not cancelled (that is, those with valid departure and arrival times)
select *
from flights 
where dep_time is not null and arr_time is not null

# dplyr
filter(flights, !is.na(dep_time) & !is.na(arr_time))

## # A tibble: 328,063 x 20
##       id  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1     1  2013     1     1      517            515         2      830
##  2     2  2013     1     1      533            529         4      850
##  3     3  2013     1     1      542            540         2      923
##  4     4  2013     1     1      544            545        -1     1004
##  5     5  2013     1     1      554            600        -6      812
##  6     6  2013     1     1      554            558        -4      740
##  7     7  2013     1     1      555            600        -5      913
##  8     8  2013     1     1      557            600        -3      709
##  9     9  2013     1     1      557            600        -3      838
## 10    10  2013     1     1      558            600        -2      753
## # ... with 328,053 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))

## # A tibble: 328,063 x 20
##       id  year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1     1  2013     1     1      517            515         2      830
##  2     2  2013     1     1      533            529         4      850
##  3     3  2013     1     1      542            540         2      923
##  4     4  2013     1     1      544            545        -1     1004
##  5     5  2013     1     1      554            600        -6      812
##  6     6  2013     1     1      554            558        -4      740
##  7     7  2013     1     1      555            600        -5      913
##  8     8  2013     1     1      557            600        -3      709
##  9     9  2013     1     1      557            600        -3      838
## 10    10  2013     1     1      558            600        -2      753
## # ... with 328,053 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

-- flights that have a departure delay sorted by delay
select *
from flights 
where dep_delay > 0
order by dep_delay desc

# dplyr
filter(flights, dep_delay > 0) %>%
  arrange(desc(dep_delay))

## # A tibble: 128,432 x 20
##        id  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1   7073  2013     1     9      641            900      1301     1242
##  2 235779  2013     6    15     1432           1935      1137     1607
##  3   8240  2013     1    10     1121           1635      1126     1239
##  4 327044  2013     9    20     1139           1845      1014     1457
##  5 270377  2013     7    22      845           1600      1005     1044
##  6 173993  2013     4    10     1100           1900       960     1342
##  7 151975  2013     3    17     2321            810       911      135
##  8 247041  2013     6    27      959           1900       899     1236
##  9 270988  2013     7    22     2257            759       898      121
## 10  87239  2013    12     5      756           1700       896     1058
## # ... with 128,422 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]

## # A tibble: 128,432 x 20
##        id  year month   day dep_time sched_dep_time dep_delay arr_time
##     <int> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1   7073  2013     1     9      641            900      1301     1242
##  2 235779  2013     6    15     1432           1935      1137     1607
##  3   8240  2013     1    10     1121           1635      1126     1239
##  4 327044  2013     9    20     1139           1845      1014     1457
##  5 270377  2013     7    22      845           1600      1005     1044
##  6 173993  2013     4    10     1100           1900       960     1342
##  7 151975  2013     3    17     2321            810       911      135
##  8 247041  2013     6    27      959           1900       899     1236
##  9 270988  2013     7    22     2257            759       898      121
## 10  87239  2013    12     5      756           1700       896     1058
## # ... with 128,422 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

-- flights that catched up during the flight sorted by catch up time
select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup
from flights
where catchup > 0
order by catchup desc

# dplyr
mutate(flights, catchup = dep_delay - arr_delay) %>%
  select(id, dep_delay, arr_delay, catchup) %>%
  filter(catchup > 0) %>%
  arrange(desc(catchup))

## # A tibble: 221,565 x 4
##        id dep_delay arr_delay catchup
##     <int>     <dbl>     <dbl>   <dbl>
##  1 234103       235       126     109
##  2 133682        60       -27      87
##  3 131144       206       126      80
##  4 205313        17       -62      79
##  5 134563        24       -52      76
##  6 263236        48       -26      74
##  7 266273        34       -40      74
##  8 107573        31       -42      73
##  9 195237        -2       -75      73
## 10  67835         9       -63      72
## # ... with 221,555 more rows

# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df = df[order(desc(df$catchup)), ]
head(df, 10)

##            id dep_delay arr_delay catchup
## 234103 234103       235       126     109
## 133682 133682        60       -27      87
## 131144 131144       206       126      80
## 205313 205313        17       -62      79
## 134563 134563        24       -52      76
## 263236 263236        48       -26      74
## 266273 266273        34       -40      74
## 107573 107573        31       -42      73
## 195237 195237        -2       -75      73
## 67835   67835         9       -63      72

-- the number of flights per day
select month, day, count(*) as count
from flights
group by month, day

# dplyr
group_by(flights, month, day) %>%
  summarise(count = n())

## # A tibble: 365 x 3
## # Groups:   month [?]
##    month   day count
##    <int> <int> <int>
##  1     1     1   842
##  2     1     2   943
##  3     1     3   914
##  4     1     4   915
##  5     1     5   720
##  6     1     6   832
##  7     1     7   933
##  8     1     8   899
##  9     1     9   902
## 10     1    10   932
## # ... with 355 more rows

# or
count(flights, month, day)

## # A tibble: 365 x 3
##    month   day     n
##    <int> <int> <int>
##  1     1     1   842
##  2     1     2   943
##  3     1     3   914
##  4     1     4   915
##  5     1     5   720
##  6     1     6   832
##  7     1     7   933
##  8     1     8   899
##  9     1     9   902
## 10     1    10   932
## # ... with 355 more rows

# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
head(df, 10)

##    day month count
## 1    1     1   842
## 2    2     1   943
## 3    3     1   914
## 4    4     1   915
## 5    5     1   720
## 6    6     1   832
## 7    7     1   933
## 8    8     1   899
## 9    9     1   902
## 10  10     1   932

-- the busy days (with more than 1000 flights)
select month, day, count(*) as count
from flights
group by month, day
having count > 1000;

# dplyr
count(flights, month, day) %>%
  filter(n > 1000)

## # A tibble: 14 x 3
##    month   day     n
##    <int> <int> <int>
##  1     7     8  1004
##  2     7     9  1001
##  3     7    10  1004
##  4     7    11  1006
##  5     7    12  1002
##  6     7    17  1001
##  7     7    18  1003
##  8     7    25  1003
##  9     7    31  1001
## 10     8     7  1001
## 11     8     8  1001
## 12     8    12  1001
## 13    11    27  1014
## 14    12     2  1004

# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
df = subset(df, count > 1000)
head(df, 10)

##     day month count
## 189   8     7  1004
## 190   9     7  1001
## 191  10     7  1004
## 192  11     7  1006
## 193  12     7  1002
## 198  17     7  1001
## 199  18     7  1003
## 206  25     7  1003
## 212  31     7  1001
## 219   7     8  1001

-- the number of flights per destination
select dest, count(*) as count
from flights
group by dest;

# dplyr
count(flights, dest)

## # A tibble: 105 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     265
##  3 ALB     439
##  4 ANC       8
##  5 ATL   17215
##  6 AUS    2439
##  7 AVL     275
##  8 BDL     443
##  9 BGR     375
## 10 BHM     297
## # ... with 95 more rows

# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
head(df, 10)

##    dest count
## 1   ABQ   254
## 2   ACK   265
## 3   ALB   439
## 4   ANC     8
## 5   ATL 17215
## 6   AUS  2439
## 7   AVL   275
## 8   BDL   443
## 9   BGR   375
## 10  BHM   297

-- the popular destinations (with more than 365 flights) sorted by number of flights in descending order
select dest, count(*) as count
from flights
group by dest
having count > 365
order by count desc;

# dplyr
count(flights, dest) %>%
  filter(n > 365) %>%
  arrange(desc(n))

## # A tibble: 77 x 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705
## # ... with 67 more rows

# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df = df[order(desc(df$count)), ]
head(df, 10)

##    dest count
## 70  ORD 17283
## 5   ATL 17215
## 50  LAX 16174
## 12  BOS 15508
## 55  MCO 14082
## 24  CLT 14064
## 91  SFO 13331
## 36  FLL 12055
## 59  MIA 11728
## 29  DCA  9705

-- the mean departure delay per day sorted in decreasing order of all flights on busy days of summer 
select month, day, count(*) as count, round(avg(dep_delay), 2) as avg_delay
from flights
where month = 6 or month = 7 or month = 8
group by month, day
having count > 1000
order by avg_delay desc;

# dplyr
filter(flights, month %in% 6:8) %>%
  group_by(month, day) %>%
  summarise(n = n(), avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
  filter(n > 1000) %>%
  arrange(desc(avg_delay))

## # A tibble: 12 x 4
## # Groups:   month [2]
##    month   day     n avg_delay
##    <int> <int> <int>     <dbl>
##  1     7    10  1004     52.9 
##  2     8     8  1001     43.3 
##  3     7     8  1004     37.3 
##  4     7     9  1001     30.7 
##  5     7    12  1002     25.1 
##  6     7    11  1006     23.6 
##  7     7    18  1003     20.6 
##  8     7    25  1003     19.7 
##  9     7    17  1001     13.7 
## 10     8     7  1001      8.68
## 11     8    12  1001      8.31
## 12     7    31  1001      6.28

# base R (stack overflow)
df = aggregate(dep_delay ~ day + month, flights, 
               FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)), 
               na.action = NULL)
# df$dep_delay is a matrix
str(df)

## 'data.frame':    365 obs. of  3 variables:
##  $ day      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ month    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_delay: num [1:365, 1:2] 842 943 914 915 720 832 933 899 902 932 ...

# unnest
df2 = cbind(df[,1:2], df[,3])
colnames(df2)[3:4] = c("count", "avg_delay")

# filter
df2 = subset(df2, count > 1000)

# order
df2 = df2[order(desc(df2$avg_delay)), ]
df2

##     day month count avg_delay
## 191  10     7  1004 52.860702
## 220   8     8  1001 43.349947
## 189   8     7  1004 37.296646
## 190   9     7  1001 30.711499
## 193  12     7  1002 25.096154
## 192  11     7  1006 23.609392
## 199  18     7  1003 20.626789
## 206  25     7  1003 19.674134
## 331  27    11  1014 16.697651
## 198  17     7  1001 13.670707
## 336   2    12  1004  9.021978
## 219   7     8  1001  8.680402
## 224  12     8  1001  8.308157
## 212  31     7  1001  6.280843

-- flights that flew with a plane manufactured by BOEING
select flights.id, flights.tailnum, planes.manufacturer
from flights, planes
where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"

# dplyr
left_join(flights, planes, by="tailnum") %>%
  select(id, tailnum, manufacturer) %>%
  filter(manufacturer == "BOEING")

## # A tibble: 82,912 x 3
##       id tailnum manufacturer
##    <int> <chr>   <chr>       
##  1     1 N14228  BOEING      
##  2     2 N24211  BOEING      
##  3     3 N619AA  BOEING      
##  4     5 N668DN  BOEING      
##  5     6 N39463  BOEING      
##  6    13 N29129  BOEING      
##  7    14 N53441  BOEING      
##  8    17 N76515  BOEING      
##  9    23 N633AA  BOEING      
## 10    24 N3739P  BOEING      
## # ... with 82,902 more rows

# base R
df = merge(flights, planes, by="tailnum", all.x = TRUE)
df = subset(df, subset = (manufacturer == "BOEING"), select = c("id", "tailnum", "manufacturer"))
head(df, 10)

##          id tailnum manufacturer
## 4426 220945  N11206       BOEING
## 4427 252368  N11206       BOEING
## 4428  12885  N11206       BOEING
## 4429 132376  N11206       BOEING
## 4430 302467  N11206       BOEING
## 4431 288196  N11206       BOEING
## 4432 169060  N11206       BOEING
## 4433  55825  N11206       BOEING
## 4434 195062  N11206       BOEING
## 4435 218017  N11206       BOEING

-- flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude
select flights.id, flights.dest, airports.name, airports.alt
from flights, airports
where flights.dest = airports.faa and airports.alt > 6000
order by airports.alt;

# dplyr
left_join(flights, airports, by=c("dest" = "faa")) %>%
  select(id, dest, name, alt) %>%
  filter(alt > 6000) %>%
  arrange(alt)

## # A tibble: 253 x 4
##        id dest  name                   alt
##     <int> <chr> <chr>                <int>
##  1    153 JAC   Jackson Hole Airport  6451
##  2   1068 JAC   Jackson Hole Airport  6451
##  3 100067 JAC   Jackson Hole Airport  6451
##  4 101062 JAC   Jackson Hole Airport  6451
##  5 101913 JAC   Jackson Hole Airport  6451
##  6 102007 JAC   Jackson Hole Airport  6451
##  7 102805 JAC   Jackson Hole Airport  6451
##  8 103800 JAC   Jackson Hole Airport  6451
##  9 104658 JAC   Jackson Hole Airport  6451
## 10 105490 JAC   Jackson Hole Airport  6451
## # ... with 243 more rows

# base R
df = merge(flights, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, subset = (alt > 6000), select = c("id", "dest", "name", "alt"))
df = df[order(df$alt), ]
head(df, 10)

##            id dest                 name  alt
## 154976    153  JAC Jackson Hole Airport 6451
## 154977 157037  JAC Jackson Hole Airport 6451
## 154978 102007  JAC Jackson Hole Airport 6451
## 154979 131009  JAC Jackson Hole Airport 6451
## 154980 104658  JAC Jackson Hole Airport 6451
## 154981 118930  JAC Jackson Hole Airport 6451
## 154982 101062  JAC Jackson Hole Airport 6451
## 154983 108845  JAC Jackson Hole Airport 6451
## 154984 110520  JAC Jackson Hole Airport 6451
## 154985 108070  JAC Jackson Hole Airport 6451

-- flights that took off with a plane with 4 engines and a visibility lower than 3 miles
select flights.id, planes.engines, weather.visib
from flights, weather, planes
where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4

# dplyr
select(flights, -time_hour) %>%
  left_join(weather) %>%
  left_join(planes, by = "tailnum") %>%
  select(id, engines, visib) %>%
  filter(visib < 3, engines == 4)

## Joining, by = c("year", "month", "day", "origin", "hour")

## # A tibble: 11 x 3
##        id engines visib
##     <int>   <int> <dbl>
##  1  10066       4  2.5 
##  2  11278       4  0.25
##  3  25295       4  0   
##  4 120234       4  0.06
##  5 120640       4  0.12
##  6 120734       4  0.06
##  7 182566       4  0.12
##  8 210774       4  1.25
##  9 213173       4  0.25
## 10 227971       4  1   
## 11 228191       4  2

# base R
df = subset(flights, select = -time_hour)
df = merge(df, weather, all.x = TRUE)
df = merge(df, planes, by = "tailnum", all.x = TRUE)
df = subset(df, subset = (visib < 3 & engines == 4), select = c("id", "engines", "visib"))
head(df, 10)

##            id engines visib
## 120872  25295       4  0.00
## 235911 182566       4  0.12
## 292212 120734       4  0.06
## 292225  10066       4  2.50
## 292232  11278       4  0.25
## 292259 210774       4  1.25
## 292266 227971       4  1.00
## 292276 120640       4  0.12
## 292284 228191       4  2.00
## 292303 120234       4  0.06

-- flights with destination and origin airports with an altitude difference of more than 6000 feet
select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta
from flights, airports as airports1, airports as airports2
where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000

# dplyr
left_join(flights, airports, by = c("origin" = "faa")) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  select(id, alt.x, alt.y) %>%
  mutate(altdelta = alt.y - alt.x) %>%
  filter(altdelta > 6000)

## # A tibble: 253 x 4
##       id alt.x alt.y altdelta
##    <int> <int> <int>    <int>
##  1   153    18  6451     6433
##  2   194    18  6540     6522
##  3   571    13  6540     6527
##  4  1068    18  6451     6433
##  5  1095    18  6540     6522
##  6  1492    13  6540     6527
##  7  2052    18  6540     6522
##  8  2509    13  6540     6527
##  9  2975    18  6540     6522
## 10  3477    13  6540     6527
## # ... with 243 more rows

# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa", all.x = TRUE)
df = merge(df, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
df = subset(df, altdelta > 6000)
head(df, 10)

##            id alt.x alt.y altdelta
## 121549 155570    13  6540     6527
## 121550 149990    13  6540     6527
## 121551 125764    13  6540     6527
## 121552 149018    13  6540     6527
## 121553 165081    18  6540     6522
## 121554 109847    18  6540     6522
## 121555   1095    18  6540     6522
## 121556 122845    18  6540     6522
## 121557 135554    18  6540     6522
## 121558   7264    18  6540     6522