One type of text that gets plenty of attention is text shared online via Twitter. Both of the authors of this book are on Twitter and are fairly regular users of it, so in this case study, let’s compare the entire Twitter archives of Julia and David.

This is a beginning-to-end analysis that demonstrates how to bring together the concepts and code we have been exploring in a cohesive way to understand a text data set. Comparing word frequencies lets us see which words each author uses most often and how differently the two accounts are used.

Getting the data and distribution of tweets

An individual can download their own Twitter archive by following directions available on Twitter’s website.
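The archive arrives as CSV files; the code below assumes each one has at least a timestamp and a text column. As a quick sanity check (a sketch, using the same file paths as below):

# inspect the column names and types that readr infers for an archive CSV
library(readr)
spec_csv("data/tweets_julia.csv")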

library(lubridate)
library(ggplot2)
library(dplyr)
library(readr)

tweets_julia <- read_csv("data/tweets_julia.csv")
tweets_dave <- read_csv("data/tweets_dave.csv")
tweets <- 
  bind_rows(tweets_julia %>% mutate(person = "Julia"),
            tweets_dave %>% mutate(person = "David")) %>%
  mutate(timestamp = ymd_hms(timestamp))

ggplot(tweets, aes(x = timestamp, fill = person)) +
  geom_histogram(bins = 20, show.legend = FALSE) +
  facet_wrap(~person, ncol = 1)

Word frequencies

library(tidytext)
library(stringr)

# tokenize tweets
remove_reg <- "&amp;|&lt;|&gt;"
tidy_tweets <- tweets %>% 
  filter(!str_detect(text, "^RT")) %>% # no retweets
  mutate(text = str_remove_all(text, remove_reg)) %>% # strip the HTML escapes &amp;, &lt;, &gt;
  unnest_tokens(word, text, token = "tweets") %>% # unnest using the specialized "tweets" tokenizer
  filter(!word %in% stop_words$word, # no stop words
         !word %in% str_remove_all(stop_words$word, "'"), # no apostrophe-less stop words (e.g. dont, arent)
         str_detect(word, "[a-z]")) # keep only words with at least one letter
# calculate word frequencies for each person
frequency <- tidy_tweets %>% 
  group_by(person) %>% 
  count(word, sort = TRUE) %>% 
  left_join(tidy_tweets %>% # compute total number of words per person
              group_by(person) %>% 
              summarise(total = n()),
            by = "person") %>%
  mutate(freq = n/total)

frequency
## # A tibble: 22,582 × 5
## # Groups:   person [2]
##    person word           n total    freq
##    <chr>  <chr>      <int> <int>   <dbl>
##  1 Julia  t.co        1974 79701 0.0248 
##  2 David  t.co        1165 23477 0.0496 
##  3 Julia  http        1161 79701 0.0146 
##  4 David  https        992 23477 0.0423 
##  5 Julia  https        969 79701 0.0122 
##  6 Julia  time         585 79701 0.00734
##  7 Julia  selkie1970   570 79701 0.00715
##  8 Julia  skedman      531 79701 0.00666
##  9 Julia  day          467 79701 0.00586
## 10 Julia  baby         408 79701 0.00512
## # ℹ 22,572 more rows
library(tidyr)

# reshape so each person's frequencies occupy their own column
frequency <- frequency %>% 
  select(person, word, freq) %>% 
  spread(person, freq) %>%
  arrange(Julia, David)

frequency
## # A tibble: 19,484 × 3
##    word                David     Julia
##    <chr>               <dbl>     <dbl>
##  1 1x              0.0000426 0.0000125
##  2 5k              0.0000426 0.0000125
##  3 accept          0.0000426 0.0000125
##  4 accidental__art 0.0000426 0.0000125
##  5 accidents       0.0000426 0.0000125
##  6 adam            0.0000426 0.0000125
##  7 adams           0.0000426 0.0000125
##  8 adapt           0.0000426 0.0000125
##  9 adapted         0.0000426 0.0000125
## 10 adaptive        0.0000426 0.0000125
## # ℹ 19,474 more rows
# compare words visually
library(scales)

ggplot(frequency, aes(Julia, David)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) + # jitter points to reduce overplotting
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) + # don't overlap labels
  scale_x_log10(labels = percent_format()) + # log x scale
  scale_y_log10(labels = percent_format()) + # log y scale
  geom_abline(color = "red") # reference line where both authors use a word equally often

David has used his Twitter account almost exclusively for professional purposes since he became more active, while Julia used it for entirely personal purposes until late 2015 and still uses it more personally than David.
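Words near the red line are used with about equal frequency by David and Julia, while words far from the line are characteristic of one account. As a rough sketch of that idea (the log odds ratio below does this more carefully), we can sort shared words by their raw frequency ratio:

# rough check: which shared words have the most lopsided frequencies?
frequency %>%
  filter(!is.na(David), !is.na(Julia)) %>% # keep words both authors used
  mutate(ratio = David / Julia) %>%
  arrange(desc(ratio))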

Comparing word usage

# restrict the analysis to tweets from 2016
tidy_tweets <- tidy_tweets %>%
  filter(timestamp >= as.Date("2016-01-01"),
         timestamp < as.Date("2017-01-01"))

We can calculate the log odds ratio for each word, using:

\[\mathrm{log\ odds\ ratio} = \log \frac{\frac{n_D + 1}{t_D + 1}}{\frac{n_J + 1}{t_J + 1}} \]

where, for each word, \(n_D\) and \(n_J\) are the number of times David and Julia, respectively, used the word, while \(t_D\) and \(t_J\) are the total number of words used by David and Julia, respectively.
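For instance, with hypothetical counts: if David used a word 9 times out of \(t_D = 999\) total words while Julia used it once out of \(t_J = 1999\), the log odds ratio would be \(\log \frac{(9+1)/(999+1)}{(1+1)/(1999+1)} = \log \frac{0.01}{0.001} = \log 10 \approx 2.3\), marking the word as strongly characteristic of David; words used about equally often by both authors get values near zero.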

# compute the log odds ratio for each word
word_ratios <- tidy_tweets %>%
  filter(!str_detect(word, "^@")) %>% # drop usernames
  count(word, person) %>%
  group_by(word) %>%
  filter(sum(n) >= 10) %>% # keep words used at least 10 times in total
  ungroup() %>%
  spread(person, n, fill = 0) %>%
  mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>% # (n + 1) / (t + 1) for each person
  mutate(logratio = log(David / Julia)) %>%
  arrange(desc(logratio))
# Julia words
word_ratios %>% 
  arrange(logratio)
## # A tibble: 475 × 4
##    word             David   Julia logratio
##    <chr>            <dbl>   <dbl>    <dbl>
##  1 jowanza       0.000126 0.00906    -4.27
##  2 skedman       0.000126 0.00804    -4.16
##  3 nicole_cliffe 0.000126 0.00716    -4.04
##  4 omg           0.000126 0.00585    -3.84
##  5 secretbeck    0.000126 0.00497    -3.67
##  6 utah          0.000126 0.00482    -3.64
##  7 jkru          0.000126 0.00395    -3.44
##  8 selkie1970    0.000126 0.00380    -3.41
##  9 gosh          0.000126 0.00336    -3.28
## 10 slc           0.000126 0.00322    -3.24
## # ℹ 465 more rows
# David words
word_ratios %>% 
  arrange(-logratio)
## # A tibble: 475 × 4
##    word           David    Julia logratio
##    <chr>          <dbl>    <dbl>    <dbl>
##  1 juliasilge   0.0161  0.000146     4.70
##  2 jtleek       0.00858 0.000146     4.07
##  3 e.g          0.00668 0.000146     3.82
##  4 user2016     0.00429 0.000146     3.38
##  5 jsm2016      0.00416 0.000146     3.35
##  6 ucfagls      0.00416 0.000146     3.35
##  7 android      0.00328 0.000146     3.11
##  8 traffic      0.00315 0.000146     3.07
##  9 news3jessica 0.00303 0.000146     3.03
## 10 dev          0.00290 0.000146     2.99
## # ℹ 465 more rows
# What are some words that have been about equally likely to come from David or Julia’s account during 2016?
word_ratios %>% 
  arrange(abs(logratio))
## # A tibble: 475 × 4
##    word          David    Julia logratio
##    <chr>         <dbl>    <dbl>    <dbl>
##  1 hadley     0.000883 0.000877  0.00641
##  2 team       0.000883 0.000877  0.00641
##  3 writing    0.00177  0.00175   0.00641
##  4 https      0.109    0.110    -0.00792
##  5 t.co       0.109    0.110    -0.00792
##  6 account    0.00101  0.00102  -0.0142 
##  7 api        0.00101  0.00102  -0.0142 
##  8 function   0.00202  0.00205  -0.0142 
##  9 population 0.00101  0.00102  -0.0142 
## 10 rmflight   0.00101  0.00102  -0.0142 
## # ℹ 465 more rows
# Which words are most likely to be from Julia’s account or from David’s account?
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (David/Julia)") +
  scale_fill_discrete(name = "", labels = c("David", "Julia"))

Changes in word use

The section above looked at overall word use; now let's ask a different question: which words' frequencies have changed fastest in our Twitter feeds? In other words, which words have we tweeted about at a higher or lower rate as time has passed?

words_by_time <- tidy_tweets %>%
  filter(!str_detect(word, "^@")) %>%
  mutate(time_floor = floor_date(timestamp, unit = "1 month")) %>% # unit of time is one month
  count(time_floor, person, word) %>% # number of times a word is used by that person in that time unit
  group_by(person, time_floor) %>%
  mutate(time_total = sum(n)) %>% # total number of words used by that person in that time unit
  group_by(person, word) %>%
  mutate(word_total = sum(n)) %>% # total number of times that word is used by that person
  ungroup() %>%
  rename(count = n) %>%
  filter(word_total > 30)

words_by_time
## # A tibble: 632 × 6
##    time_floor          person word          count time_total word_total
##    <dttm>              <chr>  <chr>         <int>      <int>      <int>
##  1 2016-01-01 00:00:00 David  broom             2        420         39
##  2 2016-01-01 00:00:00 David  data              2        420        155
##  3 2016-01-01 00:00:00 David  ggplot2           1        420         39
##  4 2016-01-01 00:00:00 David  hadleywickham     3        420        211
##  5 2016-01-01 00:00:00 David  hspter            7        420         95
##  6 2016-01-01 00:00:00 David  https            12        420        865
##  7 2016-01-01 00:00:00 David  jasonpunyon       1        420         56
##  8 2016-01-01 00:00:00 David  jennybryan        1        420        110
##  9 2016-01-01 00:00:00 David  quominus         13        420         67
## 10 2016-01-01 00:00:00 David  rallidaerule      6        420         40
## # ℹ 622 more rows
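Each row here corresponds to one person using one word in a given one-month bin. To model how each word's frequency changes over time, we can nest the monthly observations for each person-word pair into a list column, then fit a model to each nested data frame.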
# nest the monthly counts for each person-word pair into a list column
nested_data <- words_by_time %>%
  nest(data = c(-word, -person))

nested_data
## # A tibble: 64 × 3
##    person word          data             
##    <chr>  <chr>         <list>           
##  1 David  broom         <tibble [10 × 4]>
##  2 David  data          <tibble [12 × 4]>
##  3 David  ggplot2       <tibble [10 × 4]>
##  4 David  hadleywickham <tibble [12 × 4]>
##  5 David  hspter        <tibble [11 × 4]>
##  6 David  https         <tibble [12 × 4]>
##  7 David  jasonpunyon   <tibble [12 × 4]>
##  8 David  jennybryan    <tibble [12 × 4]>
##  9 David  quominus      <tibble [11 × 4]>
## 10 David  rallidaerule  <tibble [8 × 4]> 
## # ℹ 54 more rows
nested_data$data[1]
## [[1]]
## # A tibble: 10 × 4
##    time_floor          count time_total word_total
##    <dttm>              <int>      <int>      <int>
##  1 2016-01-01 00:00:00     2        420         39
##  2 2016-02-01 00:00:00     4       1464         39
##  3 2016-03-01 00:00:00     4       1419         39
##  4 2016-04-01 00:00:00     6       1562         39
##  5 2016-06-01 00:00:00     4       1127         39
##  6 2016-07-01 00:00:00     4       1069         39
##  7 2016-08-01 00:00:00     6       2839         39
##  8 2016-09-01 00:00:00     4       1788         39
##  9 2016-11-01 00:00:00     3       1776         39
## 10 2016-12-01 00:00:00     2        996         39

We use a generalized linear model (glm) with binomial error distribution and a log-odds (or logit) link function. Notice that a typical formula in an R model has the form response ~ terms where response is the (numeric) response vector and terms is a series of terms which specifies a linear predictor for response. For binomial families the response can also be specified as a two-column matrix with the columns giving the numbers of successes and failures.
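As a minimal sketch of that two-column response with toy numbers (not the tweet data): the first column holds successes, the second failures. Note that the case study's code below passes time_total directly as the second column, which is a close approximation here since each count is small relative to its monthly total.

# toy data: successes out of `trials` observed in each of four periods
successes <- c(3, 5, 9, 14)
trials <- c(100, 110, 120, 130)
period <- 1:4

# two-column binomial response: cbind(successes, failures)
fit <- glm(cbind(successes, trials - successes) ~ period, family = "binomial")
summary(fit) # the period coefficient is the change in log odds per period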

library(purrr)

nested_models <- 
  nested_data %>%
  mutate(models = map(data, ~ glm(cbind(count, time_total) ~ time_floor, ., family = "binomial")))

nested_models$models[1]
## [[1]]
## 
## Call:  glm(formula = cbind(count, time_total) ~ time_floor, family = "binomial", 
##     data = .)
## 
## Coefficients:
## (Intercept)   time_floor  
##    2.97e+01    -2.43e-08  
## 
## Degrees of Freedom: 9 Total (i.e. Null);  8 Residual
## Null Deviance:       3.371 
## Residual Deviance: 1.684     AIC: 37.52
library(broom)

# pull out the slopes for each of these models
slopes <- nested_models %>%
  mutate(models = map(models, tidy)) %>% 
  unnest(cols = c(models)) %>%
  filter(term == "time_floor") %>%
  # We are comparing many slopes here and some of them are not statistically significant, 
  # so let’s apply an adjustment to the p-values for multiple comparisons.
  mutate(adjusted.p.value = p.adjust(p.value)) 

# keep only the statistically significant slopes
top_slopes <- slopes %>% 
  filter(adjusted.p.value < 0.05)

top_slopes
## # A tibble: 11 × 9
##    person word       data     term         estimate std.error statistic  p.value
##    <chr>  <chr>      <list>   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
##  1 David  ggplot2    <tibble> time_floor   -8.73e-8   1.98e-8     -4.42 9.98e- 6
##  2 David  hspter     <tibble> time_floor   -5.67e-8   1.22e-8     -4.64 3.42e- 6
##  3 David  https      <tibble> time_floor    2.33e-8   4.28e-9      5.45 5.11e- 8
##  4 David  quominus   <tibble> time_floor   -5.35e-8   1.56e-8     -3.42 6.25e- 4
##  5 David  t.co       <tibble> time_floor    2.33e-8   4.28e-9      5.45 5.11e- 8
##  6 David  treycausey <tibble> time_floor   -1.52e-7   3.02e-8     -5.05 4.37e- 7
##  7 Julia  post       <tibble> time_floor   -5.06e-8   1.45e-8     -3.49 4.86e- 4
##  8 Julia  rstats     <tibble> time_floor   -4.80e-8   1.11e-8     -4.32 1.58e- 5
##  9 David  jtleek     <tibble> time_floor   -1.24e-7   1.88e-8     -6.60 4.20e-11
## 10 David  stack      <tibble> time_floor    7.68e-8   2.20e-8      3.48 4.93e- 4
## 11 David  user2016   <tibble> time_floor   -8.31e-7   1.55e-7     -5.35 9.03e- 8
## # ℹ 1 more variable: adjusted.p.value <dbl>
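Note that p.adjust() applies the conservative Holm correction by default. A false-discovery-rate adjustment is a reasonable alternative to compare (a sketch, not the choice made above):

# alternative: Benjamini-Hochberg FDR instead of the default Holm correction
slopes %>% 
  mutate(adjusted.p.value = p.adjust(p.value, method = "BH")) %>% 
  filter(adjusted.p.value < 0.05)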
# Which words have changed in frequency at a moderately significant level in our tweets?
words_by_time %>%
  inner_join(top_slopes, by = c("word", "person")) %>%
  filter(person == "David") %>%
  ggplot(aes(time_floor, count/time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")

words_by_time %>%
  inner_join(top_slopes, by = c("word", "person")) %>%
  filter(person == "Julia") %>%
  ggplot(aes(time_floor, count/time_total, color = word)) +
  geom_line(size = 1.3) +
  labs(x = NULL, y = "Word frequency")

Favorites and retweets

When a user downloads their own Twitter archive, favorites and retweets are not included, so we constructed another dataset of the authors’ tweets that includes this information. We accessed our own tweets via the Twitter API and downloaded about 3200 tweets for each person.
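For reference, one way to fetch a similar timeline is the rtweet package; this is a hedged sketch, not the method used to build these data, and it assumes working Twitter API credentials (API access rules have changed since 2016):

# sketch: pull recent tweets for each account via rtweet (requires API auth)
library(rtweet)
tweets_julia <- get_timeline("juliasilge", n = 3200)
tweets_dave <- get_timeline("drob", n = 3200)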

tweets_julia <- read_csv("data/juliasilge_tweets.csv")
tweets_dave <- read_csv("data/drob_tweets.csv")

tweets <- 
  bind_rows(tweets_julia %>% mutate(person = "Julia"),
            tweets_dave %>% mutate(person = "David")) %>%
  mutate(created_at = ymd_hms(created_at))

tweets
## # A tibble: 6,410 × 7
##         id created_at          source            retweets favorites text  person
##      <dbl> <dttm>              <chr>                <dbl>     <dbl> <chr> <chr> 
##  1 8.04e17 2016-12-01 18:09:04 Twitter Web Clie…       67         0 "RT … Julia 
##  2 8.04e17 2016-12-01 18:07:37 Twitter Web Clie…        1         0 "RT … Julia 
##  3 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 "My … Julia 
##  4 8.04e17 2016-12-01 16:42:03 Twitter Web Clie…        0         9 "It'… Julia 
##  5 8.04e17 2016-12-01 13:11:37 Twitter for iPho…        0         1 "@da… Julia 
##  6 8.04e17 2016-12-01 02:57:15 Twitter Web Clie…        0         2 "@jk… Julia 
##  7 8.04e17 2016-12-01 02:56:10 Twitter Web Clie…        0        11 "Was… Julia 
##  8 8.04e17 2016-11-30 18:55:59 Twitter Web Clie…        0         2 "@Je… Julia 
##  9 8.04e17 2016-11-30 18:41:46 Twitter Web Clie…        0         2 "@Jo… Julia 
## 10 8.04e17 2016-11-30 18:40:27 Twitter Web Clie…        0        17 "Am … Julia 
## # ℹ 6,400 more rows
# clean data
tidy_tweets <- tweets %>% 
  filter(!str_detect(text, "^(RT|@)")) %>% # drop retweets and replies
  mutate(text = str_remove_all(text, remove_reg)) %>% # strip HTML escapes again
  unnest_tokens(word, text, token = "tweets") %>% # the "tweets" tokenizer keeps emoji, usernames, and hashtags
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"))

# Notice that the word column contains tokenized emoji
tidy_tweets
## # A tibble: 14,343 × 7
##         id created_at          source            retweets favorites person word 
##      <dbl> <dttm>              <chr>                <dbl>     <dbl> <chr>  <chr>
##  1 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 Julia  score
##  2 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 Julia  50   
##  3 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 Julia  https
##  4 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 Julia  t.co 
##  5 8.04e17 2016-12-01 16:44:03 Twitter Web Clie…        0         0 Julia  9iyp…
##  6 8.04e17 2016-12-01 16:42:03 Twitter Web Clie…        0         9 Julia  snow…
##  7 8.04e17 2016-12-01 16:42:03 Twitter Web Clie…        0         9 Julia  drin…
##  8 8.04e17 2016-12-01 16:42:03 Twitter Web Clie…        0         9 Julia  tea  
##  9 8.04e17 2016-12-01 16:42:03 Twitter Web Clie…        0         9 Julia  rsta…
## 10 8.04e17 2016-12-01 02:56:10 Twitter Web Clie…        0        11 Julia  julie
## # ℹ 14,333 more rows
# total number of retweets for each person
totals <- tidy_tweets %>% 
  group_by(person, id) %>% 
  summarise(rts = first(retweets)) %>% #count retweets only once, not for every word
  group_by(person) %>% 
  summarise(total_rts = sum(rts))

totals
## # A tibble: 2 × 2
##   person total_rts
##   <chr>      <dbl>
## 1 David      13022
## 2 Julia       1751
# median number of retweets for each word and person
word_by_rts <- tidy_tweets %>% 
  group_by(id, word, person) %>% 
  summarise(rts = first(retweets)) %>% #count retweets only once, not for every repetition of a word
  group_by(person, word) %>% 
  summarise(retweets = median(rts), uses = n()) %>%
  left_join(totals, by = "person") %>%
  filter(retweets != 0) %>%
  ungroup()

word_by_rts %>% 
  filter(uses >= 5) %>%
  arrange(desc(retweets))
## # A tibble: 185 × 5
##    person word          retweets  uses total_rts
##    <chr>  <chr>            <dbl> <int>     <dbl>
##  1 David  animation         85       5     13022
##  2 David  stackoverflow     56       5     13022
##  3 David  download          52       5     13022
##  4 David  start             51       7     13022
##  5 Julia  tidytext          50       7      1751
##  6 David  gganimate         45       8     13022
##  7 David  introducing       45       6     13022
##  8 David  understanding     37       6     13022
##  9 David  error             34.5     8     13022
## 10 David  bayesian          34       7     13022
## # ℹ 175 more rows
# visualize most retweeted words
word_by_rts %>%
  filter(uses >= 5) %>%
  group_by(person) %>%
  top_n(10, retweets) %>%
  arrange(retweets) %>%
  ungroup() %>%
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, retweets, fill = person)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ person, scales = "free", ncol = 2) +
  coord_flip() +
  labs(x = NULL, 
       y = "Median # of retweets for tweets containing each word")

# repeat the analysis for favourites
totals <- tidy_tweets %>% 
  group_by(person, id) %>% 
  summarise(favs = first(favorites)) %>% 
  group_by(person) %>% 
  summarise(total_favs = sum(favs))

word_by_favs <- tidy_tweets %>% 
  group_by(id, word, person) %>% 
  summarise(favs = first(favorites)) %>% 
  group_by(person, word) %>% 
  summarise(favorites = median(favs), uses = n()) %>%
  left_join(totals, by = "person") %>%
  filter(favorites != 0) %>%
  ungroup()

word_by_favs %>%
  filter(uses >= 5) %>%
  group_by(person) %>%
  top_n(10, favorites) %>%
  arrange(favorites) %>%
  ungroup() %>%
  mutate(word = factor(word, unique(word))) %>%
  ggplot(aes(word, favorites, fill = person)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ person, scales = "free", ncol = 2) +
  coord_flip() +
  labs(x = NULL, 
       y = "Median # of favorites for tweets containing each word")