One type of text that gets plenty of attention is text shared online via Twitter. Both of the authors of this book are on Twitter and are fairly regular users of it, so in this case study, let’s compare the entire Twitter archives of Julia and David.
This is a beginning-to-end analysis that demonstrates how to bring together the concepts and code we have been exploring in a cohesive way to understand a text data set. Comparing word frequencies allows us to see which words each of us uses most often and how our tweeting styles differ.
An individual can download their own Twitter archive by following directions available on Twitter’s website.
library(lubridate)
library(ggplot2)
library(dplyr)
library(readr)

tweets_julia <- read_csv("data/tweets_julia.csv")
tweets_dave <- read_csv("data/tweets_dave.csv")

tweets <- bind_rows(tweets_julia %>% mutate(person = "Julia"),
                    tweets_dave %>% mutate(person = "David")) %>%
  mutate(timestamp = ymd_hms(timestamp))

# plot the distribution of tweets over time, one facet per person
ggplot(tweets, aes(x = timestamp, fill = person)) +
  geom_histogram(bins = 20, show.legend = FALSE) +
  facet_wrap(~person, ncol = 1)
library(tidytext)
library(stringr)
# tokenize tweets
remove_reg <- "&amp;|&lt;|&gt;"
tidy_tweets <- tweets %>%
  filter(!str_detect(text, "^RT")) %>% # drop retweets
  mutate(text = str_remove_all(text, remove_reg)) %>% # strip HTML-escaped &, <, >
  unnest_tokens(word, text) %>% # tokenize into single words
  filter(!word %in% stop_words$word, # no stop words
         !word %in% str_remove_all(stop_words$word, "'"), # no apostrophe-stripped stop words (e.g. dont, arent)
         str_detect(word, "[a-z]")) # keep only tokens containing at least one letter
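As a purely illustrative aside (not part of the original pipeline), you can peek at what the apostrophe-stripped list adds beyond the standard lexicon; tokens like "dont" and "arent" are what turn up:

# illustrative: which apostrophe-stripped stop word forms are new?
setdiff(str_remove_all(stop_words$word, "'"), stop_words$word) %>% head()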
# calculate word frequencies for each person
frequency <- tidy_tweets %>%
  group_by(person) %>%
  count(word, sort = TRUE) %>%
  left_join(tidy_tweets %>% # join on each person's total word count
              group_by(person) %>%
              summarise(total = n())) %>%
  mutate(freq = n / total)

frequency
## # A tibble: 22,582 × 5
## # Groups: person [2]
## person word n total freq
## <chr> <chr> <int> <int> <dbl>
## 1 Julia t.co 1974 79701 0.0248
## 2 David t.co 1165 23477 0.0496
## 3 Julia http 1161 79701 0.0146
## 4 David https 992 23477 0.0423
## 5 Julia https 969 79701 0.0122
## 6 Julia time 585 79701 0.00734
## 7 Julia selkie1970 570 79701 0.00715
## 8 Julia skedman 531 79701 0.00666
## 9 Julia day 467 79701 0.00586
## 10 Julia baby 408 79701 0.00512
## # ℹ 22,572 more rows
library(tidyr)

# reshape so each person's frequencies sit in their own column
frequency <- frequency %>%
  select(person, word, freq) %>%
  spread(person, freq) %>%
  arrange(Julia, David)
frequency
## # A tibble: 19,484 × 3
## word David Julia
## <chr> <dbl> <dbl>
## 1 1x 0.0000426 0.0000125
## 2 5k 0.0000426 0.0000125
## 3 accept 0.0000426 0.0000125
## 4 accidental__art 0.0000426 0.0000125
## 5 accidents 0.0000426 0.0000125
## 6 adam 0.0000426 0.0000125
## 7 adams 0.0000426 0.0000125
## 8 adapt 0.0000426 0.0000125
## 9 adapted 0.0000426 0.0000125
## 10 adaptive 0.0000426 0.0000125
## # ℹ 19,474 more rows
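Note that spread() has since been superseded in tidyr. On a current version, the chain above could equivalently be written with pivot_wider(); this replaces the spread() call rather than adding a step:

# equivalent reshape with the newer tidyr verb
# (apply to the pre-reshape frequency data frame)
frequency %>%
  select(person, word, freq) %>%
  pivot_wider(names_from = person, values_from = freq) %>%
  arrange(Julia, David)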
# compare word frequencies visually
library(scales)

ggplot(frequency, aes(Julia, David)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) + # jitter to reduce overplotting
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) + # don't overlap labels
  scale_x_log10(labels = percent_format()) + # log scale on x
  scale_y_log10(labels = percent_format()) + # log scale on y
  geom_abline(color = "red") # reference line: words on it are used equally by both
David has used his Twitter account almost exclusively for professional purposes since he became more active, while Julia used it for entirely personal purposes until late 2015 and still uses it more personally than David.
# restrict the analysis to tweets from 2016
tidy_tweets <- tidy_tweets %>%
  filter(timestamp >= as.Date("2016-01-01"),
         timestamp < as.Date("2017-01-01"))
We can calculate the log odds ratio for each word, using

\[\mathrm{log\ odds\ ratio} = \log \left( \frac{\frac{n_D + 1}{t_D + 1}}{\frac{n_J + 1}{t_J + 1}} \right)\]

where, for each word, \(n_D\) and \(n_J\) are the number of times David and Julia, respectively, used that word, and \(t_D\) and \(t_J\) are the total number of words David and Julia, respectively, used. Here \(\log\) is the natural log, matching log() in the code below, and adding 1 to each count keeps the ratio finite for words that only one of us used.
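As a quick worked example with invented counts: if David used some word 5 times out of 20,000 total words and Julia used it once out of 70,000, the log odds ratio comes out strongly positive, i.e. toward David:

# illustrative only: invented counts, not from our archives
log(((5 + 1) / (20000 + 1)) / ((1 + 1) / (70000 + 1)))
## about 2.35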
# compute the log odds ratio for each word
word_ratios <- tidy_tweets %>%
  filter(!str_detect(word, "^@")) %>% # drop @mention tokens, if any
  count(word, person) %>%
  group_by(word) %>%
  filter(sum(n) >= 10) %>% # keep words used at least 10 times across both accounts
  ungroup() %>%
  spread(person, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>% # smoothed frequencies
  mutate(logratio = log(David / Julia)) %>%
  arrange(desc(logratio))
# words most characteristic of Julia (most negative log odds ratio)
word_ratios %>%
  arrange(logratio)
## # A tibble: 475 × 4
## word David Julia logratio
## <chr> <dbl> <dbl> <dbl>
## 1 jowanza 0.000126 0.00906 -4.27
## 2 skedman 0.000126 0.00804 -4.16
## 3 nicole_cliffe 0.000126 0.00716 -4.04
## 4 omg 0.000126 0.00585 -3.84
## 5 secretbeck 0.000126 0.00497 -3.67
## 6 utah 0.000126 0.00482 -3.64
## 7 jkru 0.000126 0.00395 -3.44
## 8 selkie1970 0.000126 0.00380 -3.41
## 9 gosh 0.000126 0.00336 -3.28
## 10 slc 0.000126 0.00322 -3.24
## # ℹ 465 more rows
# words most characteristic of David (most positive log odds ratio)
word_ratios %>%
  arrange(-logratio)
## # A tibble: 475 × 4
## word David Julia logratio
## <chr> <dbl> <dbl> <dbl>
## 1 juliasilge 0.0161 0.000146 4.70
## 2 jtleek 0.00858 0.000146 4.07
## 3 e.g 0.00668 0.000146 3.82
## 4 user2016 0.00429 0.000146 3.38
## 5 jsm2016 0.00416 0.000146 3.35
## 6 ucfagls 0.00416 0.000146 3.35
## 7 android 0.00328 0.000146 3.11
## 8 traffic 0.00315 0.000146 3.07
## 9 news3jessica 0.00303 0.000146 3.03
## 10 dev 0.00290 0.000146 2.99
## # ℹ 465 more rows
# Which words were about equally likely to come from David's or Julia's account during 2016?
word_ratios %>%
  arrange(abs(logratio))
## # A tibble: 475 × 4
## word David Julia logratio
## <chr> <dbl> <dbl> <dbl>
## 1 hadley 0.000883 0.000877 0.00641
## 2 team 0.000883 0.000877 0.00641
## 3 writing 0.00177 0.00175 0.00641
## 4 https 0.109 0.110 -0.00792
## 5 t.co 0.109 0.110 -0.00792
## 6 account 0.00101 0.00102 -0.0142
## 7 api 0.00101 0.00102 -0.0142
## 8 function 0.00202 0.00205 -0.0142
## 9 population 0.00101 0.00102 -0.0142
## 10 rmflight 0.00101 0.00102 -0.0142
## # ℹ 465 more rows
# Which words are most likely to be from Julia's account and which from David's?
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (David/Julia)") +
  scale_fill_discrete(name = "", labels = c("David", "Julia"))
The section above looked at overall word use; now let's ask a different question: which words' frequencies have changed over time in each of our Twitter feeds?
words_by_time <- tidy_tweets %>%
  filter(!str_detect(word, "^@")) %>%
  mutate(time_floor = floor_date(timestamp, unit = "1 month")) %>% # bin tweets into one-month units
  count(time_floor, person, word) %>% # times each person used each word in each month
  group_by(person, time_floor) %>%
  mutate(time_total = sum(n)) %>% # total words that person used in that month
  group_by(person, word) %>%
  mutate(word_total = sum(n)) %>% # total times that person used that word overall
  ungroup() %>%
  rename(count = n) %>%
  filter(word_total > 30) # keep only reasonably common words
words_by_time
## # A tibble: 632 × 6
## time_floor person word count time_total word_total
## <dttm> <chr> <chr> <int> <int> <int>
## 1 2016-01-01 00:00:00 David broom 2 420 39
## 2 2016-01-01 00:00:00 David data 2 420 155
## 3 2016-01-01 00:00:00 David ggplot2 1 420 39
## 4 2016-01-01 00:00:00 David hadleywickham 3 420 211
## 5 2016-01-01 00:00:00 David hspter 7 420 95
## 6 2016-01-01 00:00:00 David https 12 420 865
## 7 2016-01-01 00:00:00 David jasonpunyon 1 420 56
## 8 2016-01-01 00:00:00 David jennybryan 1 420 110
## 9 2016-01-01 00:00:00 David quominus 13 420 67
## 10 2016-01-01 00:00:00 David rallidaerule 6 420 40
## # ℹ 622 more rows
# nest the monthly counts for each person-word combination
nested_data <- words_by_time %>%
  nest(data = c(time_floor, count, time_total, word_total)) # older tidyr: nest(-word, -person)
nested_data
## # A tibble: 64 × 3
## person word data
## <chr> <chr> <list>
## 1 David broom <tibble [10 × 4]>
## 2 David data <tibble [12 × 4]>
## 3 David ggplot2 <tibble [10 × 4]>
## 4 David hadleywickham <tibble [12 × 4]>
## 5 David hspter <tibble [11 × 4]>
## 6 David https <tibble [12 × 4]>
## 7 David jasonpunyon <tibble [12 × 4]>
## 8 David jennybryan <tibble [12 × 4]>
## 9 David quominus <tibble [11 × 4]>
## 10 David rallidaerule <tibble [8 × 4]>
## # ℹ 54 more rows
nested_data$data[1]
## [[1]]
## # A tibble: 10 × 4
## time_floor count time_total word_total
## <dttm> <int> <int> <int>
## 1 2016-01-01 00:00:00 2 420 39
## 2 2016-02-01 00:00:00 4 1464 39
## 3 2016-03-01 00:00:00 4 1419 39
## 4 2016-04-01 00:00:00 6 1562 39
## 5 2016-06-01 00:00:00 4 1127 39
## 6 2016-07-01 00:00:00 4 1069 39
## 7 2016-08-01 00:00:00 6 2839 39
## 8 2016-09-01 00:00:00 4 1788 39
## 9 2016-11-01 00:00:00 3 1776 39
## 10 2016-12-01 00:00:00 2 996 39
To model each word's use over time we ask: in each month, how many of a person's words were this particular word, and how does that proportion change with time? We use a generalized linear model (GLM) with a binomial error distribution and a log-odds (logit) link function. Recall that a formula in an R model has the form response ~ terms, where response is the response vector and terms specifies a linear predictor for it. For binomial families, the response can also be given as a two-column matrix whose columns are the numbers of successes and failures.
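As a minimal sketch of that two-column response form on toy data (the numbers are made up purely to show the interface):

# toy binomial GLM with a successes/failures matrix as the response
toy <- data.frame(x = 1:5,
                  successes = c(2, 3, 5, 8, 12),
                  failures  = c(98, 97, 95, 92, 88))
glm(cbind(successes, failures) ~ x, data = toy, family = "binomial")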
library(purrr)

# fit a binomial GLM of word use vs. time for every person-word combination;
# glm() treats the second response column as failures, so time_total stands in
# for failures here (a close approximation since counts are tiny relative to totals)
nested_models <- nested_data %>%
  mutate(models = map(data, ~ glm(cbind(count, time_total) ~ time_floor, .,
                                  family = "binomial")))
nested_models$models[1]
## [[1]]
##
## Call: glm(formula = cbind(count, time_total) ~ time_floor, family = "binomial",
## data = .)
##
## Coefficients:
## (Intercept) time_floor
## 2.97e+01 -2.43e-08
##
## Degrees of Freedom: 9 Total (i.e. Null); 8 Residual
## Null Deviance: 3.371
## Residual Deviance: 1.684 AIC: 37.52
library(broom)

# pull out the slope for each of these models
slopes <- nested_models %>%
  mutate(models = map(models, tidy)) %>%
  unnest(cols = c(models)) %>%
  filter(term == "time_floor") %>%
  # we are comparing many slopes here and some of them are not statistically
  # significant, so apply an adjustment to the p-values for multiple comparisons
  mutate(adjusted.p.value = p.adjust(p.value))
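By default p.adjust() applies the Holm correction, which is conservative and valid without independence assumptions. A minimal illustration on invented p-values:

# invented p-values, shown only to illustrate the Holm adjustment
p.adjust(c(0.001, 0.01, 0.04), method = "holm")
## 0.003 0.020 0.040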
# keep only slopes that remain significant after adjustment
top_slopes <- slopes %>%
  filter(adjusted.p.value < 0.05)
top_slopes
## # A tibble: 11 × 9
## person word data term estimate std.error statistic p.value
## <chr> <chr> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 David ggplot2 <tibble> time_floor -8.73e-8 1.98e-8 -4.42 9.98e- 6
## 2 David hspter <tibble> time_floor -5.67e-8 1.22e-8 -4.64 3.42e- 6
## 3 David https <tibble> time_floor 2.33e-8 4.28e-9 5.45 5.11e- 8
## 4 David quominus <tibble> time_floor -5.35e-8 1.56e-8 -3.42 6.25e- 4
## 5 David t.co <tibble> time_floor 2.33e-8 4.28e-9 5.45 5.11e- 8
## 6 David treycausey <tibble> time_floor -1.52e-7 3.02e-8 -5.05 4.37e- 7
## 7 Julia post <tibble> time_floor -5.06e-8 1.45e-8 -3.49 4.86e- 4
## 8 Julia rstats <tibble> time_floor -4.80e-8 1.11e-8 -4.32 1.58e- 5
## 9 David jtleek <tibble> time_floor -1.24e-7 1.88e-8 -6.60 4.20e-11
## 10 David stack <tibble> time_floor 7.68e-8 2.20e-8 3.48 4.93e- 4
## 11 David user2016 <tibble> time_floor -8.31e-7 1.55e-7 -5.35 9.03e- 8
## # ℹ 1 more variable: adjusted.p.value <dbl>
# Which words have changed in frequency at a moderately significant level in our tweets?
words_by_time %>%
  inner_join(top_slopes, by = c("word", "person")) %>%
  filter(person == "David") %>%
  ggplot(aes(time_floor, count / time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")

words_by_time %>%
  inner_join(top_slopes, by = c("word", "person")) %>%
  filter(person == "Julia") %>%
  ggplot(aes(time_floor, count / time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")
When a user downloads their own Twitter archive, favorites and retweets are not included, so we constructed another dataset of the authors’ tweets that includes this information. We accessed our own tweets via the Twitter API and downloaded about 3200 tweets for each person.
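One way such tweets could be pulled is with the rtweet package; this is a hedged sketch rather than the exact retrieval we used, and it assumes Twitter API credentials are already configured:

# hypothetical retrieval via rtweet; get_timeline() returns up to
# roughly 3200 of a user's most recent tweets
library(rtweet)
tweets_julia_raw <- get_timeline("juliasilge", n = 3200)
tweets_dave_raw <- get_timeline("drob", n = 3200)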
tweets_julia <- read_csv("data/juliasilge_tweets.csv")
tweets_dave <- read_csv("data/drob_tweets.csv")

tweets <- bind_rows(tweets_julia %>% mutate(person = "Julia"),
                    tweets_dave %>% mutate(person = "David")) %>%
  mutate(created_at = ymd_hms(created_at))

tweets
## # A tibble: 6,410 × 7
## id created_at source retweets favorites text person
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 8.04e17 2016-12-01 18:09:04 Twitter Web Clie… 67 0 "RT … Julia
## 2 8.04e17 2016-12-01 18:07:37 Twitter Web Clie… 1 0 "RT … Julia
## 3 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 "My … Julia
## 4 8.04e17 2016-12-01 16:42:03 Twitter Web Clie… 0 9 "It'… Julia
## 5 8.04e17 2016-12-01 13:11:37 Twitter for iPho… 0 1 "@da… Julia
## 6 8.04e17 2016-12-01 02:57:15 Twitter Web Clie… 0 2 "@jk… Julia
## 7 8.04e17 2016-12-01 02:56:10 Twitter Web Clie… 0 11 "Was… Julia
## 8 8.04e17 2016-11-30 18:55:59 Twitter Web Clie… 0 2 "@Je… Julia
## 9 8.04e17 2016-11-30 18:41:46 Twitter Web Clie… 0 2 "@Jo… Julia
## 10 8.04e17 2016-11-30 18:40:27 Twitter Web Clie… 0 17 "Am … Julia
## # ℹ 6,400 more rows
# clean the data: drop retweets and replies, strip HTML-escaped characters,
# tokenize, and remove stop words
tidy_tweets <- tweets %>%
  filter(!str_detect(text, "^(RT|@)")) %>%
  mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"))
# Notice that the word column contains tokenized emoji
tidy_tweets
## # A tibble: 14,343 × 7
## id created_at source retweets favorites person word
## <dbl> <dttm> <chr> <dbl> <dbl> <chr> <chr>
## 1 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 Julia score
## 2 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 Julia 50
## 3 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 Julia https
## 4 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 Julia t.co
## 5 8.04e17 2016-12-01 16:44:03 Twitter Web Clie… 0 0 Julia 9iyp…
## 6 8.04e17 2016-12-01 16:42:03 Twitter Web Clie… 0 9 Julia snow…
## 7 8.04e17 2016-12-01 16:42:03 Twitter Web Clie… 0 9 Julia drin…
## 8 8.04e17 2016-12-01 16:42:03 Twitter Web Clie… 0 9 Julia tea
## 9 8.04e17 2016-12-01 16:42:03 Twitter Web Clie… 0 9 Julia rsta…
## 10 8.04e17 2016-12-01 02:56:10 Twitter Web Clie… 0 11 Julia julie
## # ℹ 14,333 more rows
# total number of retweets for each person
totals <- tidy_tweets %>%
  group_by(person, id) %>%
  summarise(rts = first(retweets)) %>% # count each tweet's retweets once, not once per word
  group_by(person) %>%
  summarise(total_rts = sum(rts))
totals
## # A tibble: 2 × 2
## person total_rts
## <chr> <dbl>
## 1 David 13022
## 2 Julia 1751
# median number of retweets for each word and person
word_by_rts <- tidy_tweets %>%
  group_by(id, word, person) %>%
  summarise(rts = first(retweets)) %>% # count retweets once per tweet, not per repetition of a word
  group_by(person, word) %>%
  summarise(retweets = median(rts), uses = n()) %>%
  left_join(totals) %>%
  filter(retweets != 0) %>%
  ungroup()

word_by_rts %>%
  filter(uses >= 5) %>%
  arrange(desc(retweets))
## # A tibble: 185 × 5
## person word retweets uses total_rts
## <chr> <chr> <dbl> <int> <dbl>
## 1 David animation 85 5 13022
## 2 David stackoverflow 56 5 13022
## 3 David download 52 5 13022
## 4 David start 51 7 13022
## 5 Julia tidytext 50 7 1751
## 6 David gganimate 45 8 13022
## 7 David introducing 45 6 13022
## 8 David understanding 37 6 13022
## 9 David error 34.5 8 13022
## 10 David bayesian 34 7 13022
## # ℹ 175 more rows
# visualize the most-retweeted words
word_by_rts %>%
  filter(uses >= 5) %>%
  group_by(person) %>%
  top_n(10, retweets) %>%
  arrange(retweets) %>%
  ungroup() %>%
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, retweets, fill = person)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ person, scales = "free", ncol = 2) +
  coord_flip() +
  labs(x = NULL,
       y = "Median # of retweets for tweets containing each word")
# same analysis for favorites
totals <- tidy_tweets %>%
  group_by(person, id) %>%
  summarise(favs = first(favorites)) %>%
  group_by(person) %>%
  summarise(total_favs = sum(favs))

word_by_favs <- tidy_tweets %>%
  group_by(id, word, person) %>%
  summarise(favs = first(favorites)) %>%
  group_by(person, word) %>%
  summarise(favorites = median(favs), uses = n()) %>%
  left_join(totals) %>%
  filter(favorites != 0) %>%
  ungroup()

word_by_favs %>%
  filter(uses >= 5) %>%
  group_by(person) %>%
  top_n(10, favorites) %>%
  arrange(favorites) %>%
  ungroup() %>%
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, favorites, fill = person)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ person, scales = "free", ncol = 2) +
  coord_flip() +
  labs(x = NULL,
       y = "Median # of favorites for tweets containing each word")