We will experience querying a database with both SQL and dplyr:
## Load libraries
library(DBI)
library(dplyr)
library(dbplyr)
library(nycflights13)
# Connect to the database
nyc <- dbConnect(RSQLite::SQLite(), "nycflights13")
# Add surrogate key to flights
flights <-
flights %>%
arrange(year, month, day, sched_dep_time, carrier, flight) %>%
mutate(id = row_number()) %>%
select(id, everything())
# cure the violation of FKC
id_set_planes = anti_join(flights, planes, by = "tailnum")$id
id_set_airports = anti_join(flights, airports, by = c("dest" = "faa"))$id
flights =
mutate(flights, tailnum = ifelse(id %in% id_set_planes, NA, tailnum),
dest = ifelse(id %in% id_set_airports, NA, dest))
# write data frames into database tables (if necessary)
dbWriteTable(nyc, "flights", flights)
dbWriteTable(nyc, "airports", airports)
dbWriteTable(nyc, "planes", planes)
dbWriteTable(nyc, "weather", weather)
dbWriteTable(nyc, "airlines", airlines)
# list tables
dbListTables(nyc)
## [1] "airlines" "airports" "flights" "planes" "weather"
# list fields of a table
dbListFields(nyc, "flights")
## [1] "id" "year" "month" "day"
## [5] "dep_time" "sched_dep_time" "dep_delay" "arr_time"
## [9] "sched_arr_time" "arr_delay" "carrier" "flight"
## [13] "tailnum" "origin" "dest" "air_time"
## [17] "distance" "hour" "minute" "time_hour"
# flights on Christmas
flights_db <- tbl(nyc, "flights")
# dplyr
dplyr_res = flights_db %>%
filter(month == 12 & day == 25)
## SQL
querySQL =
"select *
from flights
where month = 12 and day = 25"
sql_res = dbGetQuery(nyc, querySQL)
# check if they are the same
dplyr_res = dplyr_res %>% collect()
dplyr_res
sql_res
# recall that for two sets A and B we have that: A = B iff A <= B and B <= A iff (A - B) = 0 and (B - A) = 0
setdiff(dplyr_res, sql_res)
setdiff(sql_res, dplyr_res)
# the number of flights per day
## dplyr
flights_db %>%
group_by(month, day) %>%
summarise(count = n())
## SQL
querySQL =
"select month, day, count(*) as count
from flights
group by month, day"
dbGetQuery(nyc, querySQL)
# flights that flew with a plane manufactured by BOEING
planes_db <- tbl(nyc, "planes")
## dplyr
inner_join(flights_db,
filter(planes_db, manufacturer == "BOEING"),
by="tailnum") %>%
select(id, tailnum, manufacturer)
## SQL
querySQL =
"select flights.id, flights.tailnum, planes.manufacturer
from flights, planes
where flights.tailnum = planes.tailnum and planes.manufacturer = 'BOEING'
"
dbGetQuery(nyc, querySQL)
# disconnect the database
dbDisconnect(nyc)
# remove the database (if necessary)
unlink("nycflights13")