Write the following queries in SQL and R (both base R and dplyr). For base R, use the following functions:
Run the queries with respect to the dataset nycflights13. Before querying, add an surrogate key called id to the data frame flights.
library(dplyr)
library(nycflights13)
# add id to flights
flights = mutate(flights, id = row_number()) %>% select(id, everything())
-- flights on Christmas select * from flights where month = 12 and day = 25
# dplyr
filter(flights, month == 12 & day == 25)
## # A tibble: 719 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 105233 2013 12 25 456 500 -4 649
## 2 105234 2013 12 25 524 515 9 805
## 3 105235 2013 12 25 542 540 2 832
## 4 105236 2013 12 25 546 550 -4 1022
## 5 105237 2013 12 25 556 600 -4 730
## 6 105238 2013 12 25 557 600 -3 743
## 7 105239 2013 12 25 557 600 -3 818
## 8 105240 2013 12 25 559 600 -1 855
## 9 105241 2013 12 25 559 600 -1 849
## 10 105242 2013 12 25 600 600 0 850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# R base
subset(flights, month == 12 & day == 25 )
## # A tibble: 719 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 105233 2013 12 25 456 500 -4 649
## 2 105234 2013 12 25 524 515 9 805
## 3 105235 2013 12 25 542 540 2 832
## 4 105236 2013 12 25 546 550 -4 1022
## 5 105237 2013 12 25 556 600 -4 730
## 6 105238 2013 12 25 557 600 -3 743
## 7 105239 2013 12 25 557 600 -3 818
## 8 105240 2013 12 25 559 600 -1 855
## 9 105241 2013 12 25 559 600 -1 849
## 10 105242 2013 12 25 600 600 0 850
## # ... with 709 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
-- flights that have a delay (either on departure or on arrival) select * from flights where dep_delay is not null or arr_delay is not null
# dplyr
filter(flights, !is.na(dep_delay) | !is.na(arr_delay))
## # A tibble: 328,521 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 2013 1 1 517 515 2 830
## 2 2 2013 1 1 533 529 4 850
## 3 3 2013 1 1 542 540 2 923
## 4 4 2013 1 1 544 545 -1 1004
## 5 5 2013 1 1 554 600 -6 812
## 6 6 2013 1 1 554 558 -4 740
## 7 7 2013 1 1 555 600 -5 913
## 8 8 2013 1 1 557 600 -3 709
## 9 9 2013 1 1 557 600 -3 838
## 10 10 2013 1 1 558 600 -2 753
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# base R
subset(flights, !is.na(dep_delay) | !is.na(arr_delay))
## # A tibble: 328,521 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 2013 1 1 517 515 2 830
## 2 2 2013 1 1 533 529 4 850
## 3 3 2013 1 1 542 540 2 923
## 4 4 2013 1 1 544 545 -1 1004
## 5 5 2013 1 1 554 600 -6 812
## 6 6 2013 1 1 554 558 -4 740
## 7 7 2013 1 1 555 600 -5 913
## 8 8 2013 1 1 557 600 -3 709
## 9 9 2013 1 1 557 600 -3 838
## 10 10 2013 1 1 558 600 -2 753
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
-- flights that were not cancelled (that is, those with valid departure and arrival times) select * from flights where dep_time is not null and arr_time is not null
# dplyr
filter(flights, !is.na(dep_time) & !is.na(arr_time))
## # A tibble: 328,063 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 2013 1 1 517 515 2 830
## 2 2 2013 1 1 533 529 4 850
## 3 3 2013 1 1 542 540 2 923
## 4 4 2013 1 1 544 545 -1 1004
## 5 5 2013 1 1 554 600 -6 812
## 6 6 2013 1 1 554 558 -4 740
## 7 7 2013 1 1 555 600 -5 913
## 8 8 2013 1 1 557 600 -3 709
## 9 9 2013 1 1 557 600 -3 838
## 10 10 2013 1 1 558 600 -2 753
## # ... with 328,053 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# base R
subset(flights, !is.na(dep_time) & !is.na(arr_time))
## # A tibble: 328,063 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 2013 1 1 517 515 2 830
## 2 2 2013 1 1 533 529 4 850
## 3 3 2013 1 1 542 540 2 923
## 4 4 2013 1 1 544 545 -1 1004
## 5 5 2013 1 1 554 600 -6 812
## 6 6 2013 1 1 554 558 -4 740
## 7 7 2013 1 1 555 600 -5 913
## 8 8 2013 1 1 557 600 -3 709
## 9 9 2013 1 1 557 600 -3 838
## 10 10 2013 1 1 558 600 -2 753
## # ... with 328,053 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
-- flights that have a departure delay sorted by delay select * from flights where dep_delay > 0 order by dep_delay desc
# dplyr
filter(flights, dep_delay > 0) %>%
arrange(desc(dep_delay))
## # A tibble: 128,432 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 7073 2013 1 9 641 900 1301 1242
## 2 235779 2013 6 15 1432 1935 1137 1607
## 3 8240 2013 1 10 1121 1635 1126 1239
## 4 327044 2013 9 20 1139 1845 1014 1457
## 5 270377 2013 7 22 845 1600 1005 1044
## 6 173993 2013 4 10 1100 1900 960 1342
## 7 151975 2013 3 17 2321 810 911 135
## 8 247041 2013 6 27 959 1900 899 1236
## 9 270988 2013 7 22 2257 759 898 121
## 10 87239 2013 12 5 756 1700 896 1058
## # ... with 128,422 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# base R
df = subset(flights, dep_delay > 0)
df[order(desc(df$dep_delay)), ]
## # A tibble: 128,432 x 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 7073 2013 1 9 641 900 1301 1242
## 2 235779 2013 6 15 1432 1935 1137 1607
## 3 8240 2013 1 10 1121 1635 1126 1239
## 4 327044 2013 9 20 1139 1845 1014 1457
## 5 270377 2013 7 22 845 1600 1005 1044
## 6 173993 2013 4 10 1100 1900 960 1342
## 7 151975 2013 3 17 2321 810 911 135
## 8 247041 2013 6 27 959 1900 899 1236
## 9 270988 2013 7 22 2257 759 898 121
## 10 87239 2013 12 5 756 1700 896 1058
## # ... with 128,422 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
-- flights that catched up during the flight sorted by catch up time select id, dep_delay, arr_delay, (dep_delay - arr_delay) as catchup from flights where catchup > 0 order by catchup desc
# dplyr
mutate(flights, catchup = dep_delay - arr_delay) %>%
select(id, dep_delay, arr_delay, catchup) %>%
filter(catchup > 0) %>%
arrange(desc(catchup))
## # A tibble: 221,565 x 4
## id dep_delay arr_delay catchup
## <int> <dbl> <dbl> <dbl>
## 1 234103 235 126 109
## 2 133682 60 -27 87
## 3 131144 206 126 80
## 4 205313 17 -62 79
## 5 134563 24 -52 76
## 6 263236 48 -26 74
## 7 266273 34 -40 74
## 8 107573 31 -42 73
## 9 195237 -2 -75 73
## 10 67835 9 -63 72
## # ... with 221,555 more rows
# base R
df = transform(flights, catchup = dep_delay - arr_delay)
df = subset(df, subset = (catchup > 0), select = c(id, dep_delay, arr_delay, catchup))
df = df[order(desc(df$catchup)), ]
head(df, 10)
## id dep_delay arr_delay catchup
## 234103 234103 235 126 109
## 133682 133682 60 -27 87
## 131144 131144 206 126 80
## 205313 205313 17 -62 79
## 134563 134563 24 -52 76
## 263236 263236 48 -26 74
## 266273 266273 34 -40 74
## 107573 107573 31 -42 73
## 195237 195237 -2 -75 73
## 67835 67835 9 -63 72
-- the number of flights per day select month, day, count(*) as count from flights group by month, day
# dplyr
group_by(flights, month, day) %>%
summarise(count = n())
## # A tibble: 365 x 3
## # Groups: month [?]
## month day count
## <int> <int> <int>
## 1 1 1 842
## 2 1 2 943
## 3 1 3 914
## 4 1 4 915
## 5 1 5 720
## 6 1 6 832
## 7 1 7 933
## 8 1 8 899
## 9 1 9 902
## 10 1 10 932
## # ... with 355 more rows
# or
count(flights, month, day)
## # A tibble: 365 x 3
## month day n
## <int> <int> <int>
## 1 1 1 842
## 2 1 2 943
## 3 1 3 914
## 4 1 4 915
## 5 1 5 720
## 6 1 6 832
## 7 1 7 933
## 8 1 8 899
## 9 1 9 902
## 10 1 10 932
## # ... with 355 more rows
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = c("count")
head(df, 10)
## day month count
## 1 1 1 842
## 2 2 1 943
## 3 3 1 914
## 4 4 1 915
## 5 5 1 720
## 6 6 1 832
## 7 7 1 933
## 8 8 1 899
## 9 9 1 902
## 10 10 1 932
-- the busy days (with more than 1000 flights) select month, day, count(*) as count from flights group by month, day having count > 1000;
# dplyr
count(flights, month, day) %>%
filter(n > 1000)
## # A tibble: 14 x 3
## month day n
## <int> <int> <int>
## 1 7 8 1004
## 2 7 9 1001
## 3 7 10 1004
## 4 7 11 1006
## 5 7 12 1002
## 6 7 17 1001
## 7 7 18 1003
## 8 7 25 1003
## 9 7 31 1001
## 10 8 7 1001
## 11 8 8 1001
## 12 8 12 1001
## 13 11 27 1014
## 14 12 2 1004
# base R
df = aggregate(id ~ day + month, flights, FUN = length)
colnames(df)[3] = "count"
df = subset(df, count > 1000)
head(df, 10)
## day month count
## 189 8 7 1004
## 190 9 7 1001
## 191 10 7 1004
## 192 11 7 1006
## 193 12 7 1002
## 198 17 7 1001
## 199 18 7 1003
## 206 25 7 1003
## 212 31 7 1001
## 219 7 8 1001
-- the number of flights per destination select dest, count(*) as count from flights group by dest;
# dplyr
count(flights, dest)
## # A tibble: 105 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 265
## 3 ALB 439
## 4 ANC 8
## 5 ATL 17215
## 6 AUS 2439
## 7 AVL 275
## 8 BDL 443
## 9 BGR 375
## 10 BHM 297
## # ... with 95 more rows
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
head(df, 10)
## dest count
## 1 ABQ 254
## 2 ACK 265
## 3 ALB 439
## 4 ANC 8
## 5 ATL 17215
## 6 AUS 2439
## 7 AVL 275
## 8 BDL 443
## 9 BGR 375
## 10 BHM 297
-- the popular destinations (with more than 365 flights) sorted by number of flights in descending order select dest, count(*) as count from flights group by dest having count > 365 order by count desc;
# dplyr
count(flights, dest) %>%
filter(n > 365) %>%
arrange(desc(n))
## # A tibble: 77 x 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
## # ... with 67 more rows
# base R
df = aggregate(id ~ dest, flights, FUN = length)
colnames(df)[2] = "count"
df = subset(df, count > 365)
df = df[order(desc(df$count)), ]
head(df, 10)
## dest count
## 70 ORD 17283
## 5 ATL 17215
## 50 LAX 16174
## 12 BOS 15508
## 55 MCO 14082
## 24 CLT 14064
## 91 SFO 13331
## 36 FLL 12055
## 59 MIA 11728
## 29 DCA 9705
-- the mean departure delay per day sorted in decreasing order of all flights on busy days of summer select month, day, count(*) as count, round(avg(dep_delay), 2) as avg_delay from flights where month = 6 or month = 7 or month = 8 group by month, day having count > 1000 order by avg_delay desc;
# dplyr
filter(flights, month %in% 6:8) %>%
group_by(month, day) %>%
summarise(n = n(), avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
filter(n > 1000) %>%
arrange(desc(avg_delay))
## # A tibble: 12 x 4
## # Groups: month [2]
## month day n avg_delay
## <int> <int> <int> <dbl>
## 1 7 10 1004 52.9
## 2 8 8 1001 43.3
## 3 7 8 1004 37.3
## 4 7 9 1001 30.7
## 5 7 12 1002 25.1
## 6 7 11 1006 23.6
## 7 7 18 1003 20.6
## 8 7 25 1003 19.7
## 9 7 17 1001 13.7
## 10 8 7 1001 8.68
## 11 8 12 1001 8.31
## 12 7 31 1001 6.28
# base R (stack overflow)
df = aggregate(dep_delay ~ day + month, flights,
FUN = function(x) cbind(count = length(x), avg_delay = mean(x, na.rm = TRUE)),
na.action = NULL)
# df$dep_delay is a matrix
str(df)
## 'data.frame': 365 obs. of 3 variables:
## $ day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_delay: num [1:365, 1:2] 842 943 914 915 720 832 933 899 902 932 ...
# unnest
df2 = cbind(df[,1:2], df[,3])
colnames(df2)[3:4] = c("count", "avg_delay")
# filter
df2 = subset(df2, count > 1000)
# order
df2 = df2[order(desc(df2$avg_delay)), ]
df2
## day month count avg_delay
## 191 10 7 1004 52.860702
## 220 8 8 1001 43.349947
## 189 8 7 1004 37.296646
## 190 9 7 1001 30.711499
## 193 12 7 1002 25.096154
## 192 11 7 1006 23.609392
## 199 18 7 1003 20.626789
## 206 25 7 1003 19.674134
## 331 27 11 1014 16.697651
## 198 17 7 1001 13.670707
## 336 2 12 1004 9.021978
## 219 7 8 1001 8.680402
## 224 12 8 1001 8.308157
## 212 31 7 1001 6.280843
-- flights that flew with a plane manufactured by BOEING select flights.id, flights.tailnum, planes.manufacturer from flights, planes where flights.tailnum = planes.tailnum and planes.manufacturer = "BOEING"
# dplyr
left_join(flights, planes, by="tailnum") %>%
select(id, tailnum, manufacturer) %>%
filter(manufacturer == "BOEING")
## # A tibble: 82,912 x 3
## id tailnum manufacturer
## <int> <chr> <chr>
## 1 1 N14228 BOEING
## 2 2 N24211 BOEING
## 3 3 N619AA BOEING
## 4 5 N668DN BOEING
## 5 6 N39463 BOEING
## 6 13 N29129 BOEING
## 7 14 N53441 BOEING
## 8 17 N76515 BOEING
## 9 23 N633AA BOEING
## 10 24 N3739P BOEING
## # ... with 82,902 more rows
# base R
df = merge(flights, planes, by="tailnum", all.x = TRUE)
df = subset(df, subset = (manufacturer == "BOEING"), select = c("id", "tailnum", "manufacturer"))
head(df, 10)
## id tailnum manufacturer
## 4426 220945 N11206 BOEING
## 4427 252368 N11206 BOEING
## 4428 12885 N11206 BOEING
## 4429 132376 N11206 BOEING
## 4430 302467 N11206 BOEING
## 4431 288196 N11206 BOEING
## 4432 169060 N11206 BOEING
## 4433 55825 N11206 BOEING
## 4434 195062 N11206 BOEING
## 4435 218017 N11206 BOEING
-- flights that flew to a destination with an altitude greater than 6000 feet sorted by altitude select flights.id, flights.dest, airports.name, airports.alt from flights, airports where flights.dest = airports.faa and airports.alt > 6000 order by airports.alt;
# dplyr
left_join(flights, airports, by=c("dest" = "faa")) %>%
select(id, dest, name, alt) %>%
filter(alt > 6000) %>%
arrange(alt)
## # A tibble: 253 x 4
## id dest name alt
## <int> <chr> <chr> <int>
## 1 153 JAC Jackson Hole Airport 6451
## 2 1068 JAC Jackson Hole Airport 6451
## 3 100067 JAC Jackson Hole Airport 6451
## 4 101062 JAC Jackson Hole Airport 6451
## 5 101913 JAC Jackson Hole Airport 6451
## 6 102007 JAC Jackson Hole Airport 6451
## 7 102805 JAC Jackson Hole Airport 6451
## 8 103800 JAC Jackson Hole Airport 6451
## 9 104658 JAC Jackson Hole Airport 6451
## 10 105490 JAC Jackson Hole Airport 6451
## # ... with 243 more rows
# base R
df = merge(flights, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, subset = (alt > 6000), select = c("id", "dest", "name", "alt"))
df = df[order(df$alt), ]
head(df, 10)
## id dest name alt
## 154976 153 JAC Jackson Hole Airport 6451
## 154977 157037 JAC Jackson Hole Airport 6451
## 154978 102007 JAC Jackson Hole Airport 6451
## 154979 131009 JAC Jackson Hole Airport 6451
## 154980 104658 JAC Jackson Hole Airport 6451
## 154981 118930 JAC Jackson Hole Airport 6451
## 154982 101062 JAC Jackson Hole Airport 6451
## 154983 108845 JAC Jackson Hole Airport 6451
## 154984 110520 JAC Jackson Hole Airport 6451
## 154985 108070 JAC Jackson Hole Airport 6451
-- flights that took off with a plane with 4 engines and a visibility lower than 3 miles select flights.id, planes.engines, weather.visib from flights, weather, planes where flights.month = weather.month and flights.day = weather.day and flights.hour = weather.hour and flights.origin = weather.origin and weather.visib < 3 and flights.tailnum = planes.tailnum and planes.engines = 4
# dplyr
select(flights, -time_hour) %>%
left_join(weather) %>%
left_join(planes, by = "tailnum") %>%
select(id, engines, visib) %>%
filter(visib < 3, engines == 4)
## Joining, by = c("year", "month", "day", "origin", "hour")
## # A tibble: 11 x 3
## id engines visib
## <int> <int> <dbl>
## 1 10066 4 2.5
## 2 11278 4 0.25
## 3 25295 4 0
## 4 120234 4 0.06
## 5 120640 4 0.12
## 6 120734 4 0.06
## 7 182566 4 0.12
## 8 210774 4 1.25
## 9 213173 4 0.25
## 10 227971 4 1
## 11 228191 4 2
# base R
df = subset(flights, select = -time_hour)
df = merge(df, weather, all.x = TRUE)
df = merge(df, planes, by = "tailnum", all.x = TRUE)
df = subset(df, subset = (visib < 3 & engines == 4), select = c("id", "engines", "visib"))
head(df, 10)
## id engines visib
## 120872 25295 4 0.00
## 235911 182566 4 0.12
## 292212 120734 4 0.06
## 292225 10066 4 2.50
## 292232 11278 4 0.25
## 292259 210774 4 1.25
## 292266 227971 4 1.00
## 292276 120640 4 0.12
## 292284 228191 4 2.00
## 292303 120234 4 0.06
-- flights with destination and origin airports with an altitude difference of more than 6000 feet select flights.id, airports2.alt, airports1.alt, (airports2.alt - airports1.alt) as altdelta from flights, airports as airports1, airports as airports2 where flights.origin = airports1.faa and flights.dest = airports2.faa and altdelta > 6000
# dplyr
left_join(flights, airports, by = c("origin" = "faa")) %>%
left_join(airports, by = c("dest" = "faa")) %>%
select(id, alt.x, alt.y) %>%
mutate(altdelta = alt.y - alt.x) %>%
filter(altdelta > 6000)
## # A tibble: 253 x 4
## id alt.x alt.y altdelta
## <int> <int> <int> <int>
## 1 153 18 6451 6433
## 2 194 18 6540 6522
## 3 571 13 6540 6527
## 4 1068 18 6451 6433
## 5 1095 18 6540 6522
## 6 1492 13 6540 6527
## 7 2052 18 6540 6522
## 8 2509 13 6540 6527
## 9 2975 18 6540 6522
## 10 3477 13 6540 6527
## # ... with 243 more rows
# base R
df = merge(flights, airports, by.x = "origin", by.y = "faa", all.x = TRUE)
df = merge(df, airports, by.x = "dest", by.y = "faa", all.x = TRUE)
df = subset(df, select = c("id", "alt.x", "alt.y"))
df = transform(df, altdelta = alt.y - alt.x)
df = subset(df, altdelta > 6000)
head(df, 10)
## id alt.x alt.y altdelta
## 121549 155570 13 6540 6527
## 121550 149990 13 6540 6527
## 121551 125764 13 6540 6527
## 121552 149018 13 6540 6527
## 121553 165081 18 6540 6522
## 121554 109847 18 6540 6522
## 121555 1095 18 6540 6522
## 121556 122845 18 6540 6522
## 121557 135554 18 6540 6522
## 121558 7264 18 6540 6522