Suppose a vandal has broken into your study and torn apart four of your books:

- Twenty Thousand Leagues under the Sea by Jules Verne
- The War of the Worlds by H. G. Wells
- Pride and Prejudice by Jane Austen
- Great Expectations by Charles Dickens

This vandal has torn the books into individual chapters, and left them in one large pile. How can we restore these disorganized chapters to their original books?

This is a challenging problem since the individual chapters are unlabeled: we don’t know what words might distinguish them into groups.

We can use topic modeling to discover how the chapters cluster into distinct topics, each of them (presumably) representing one of the books.

Download books

library(tidyverse)
library(tidytext)
library(topicmodels)
# the books were downloaded once from Project Gutenberg and cached
# locally, so the download code is kept commented out:
#
# library(gutenbergr)
#
# titles <- c("Twenty Thousand Leagues under the Sea",
#             "The War of the Worlds",
#             "Pride and Prejudice",
#             "Great Expectations")
#
# books <- gutenberg_works(title %in% titles) %>%
#   gutenberg_download(meta_fields = "title")
#
# write_rds(books, "books.rds")

books <- read_rds("books.rds")
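
A quick sanity check, assuming books.rds holds the data frame produced by the commented-out download above (one row per line of text, plus a title column): count lines per title to confirm all four books are present.

# confirm that all four titles made it into the pile
books %>%
  count(title)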

Find topics with LDA

# divide into documents, each representing one chapter
by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)

# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)

# find document-word counts, dropping common stop words
word_counts <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word, sort = TRUE)

word_counts
## # A tibble: 104,722 × 3
##    document                 word        n
##    <chr>                    <chr>   <int>
##  1 Great Expectations_57    joe        88
##  2 Great Expectations_7     joe        70
##  3 Great Expectations_17    biddy      63
##  4 Great Expectations_27    joe        58
##  5 Great Expectations_38    estella    58
##  6 Great Expectations_2     joe        56
##  7 Great Expectations_23    pocket     53
##  8 Great Expectations_15    joe        50
##  9 Great Expectations_18    joe        50
## 10 The War of the Worlds_16 brother    50
## # ℹ 104,712 more rows
# cast the counts into a document-term matrix (DTM) for topicmodels
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)

chapters_dtm
## <<DocumentTermMatrix (documents: 193, terms: 18215)>>
## Non-/sparse entries: 104722/3410773
## Sparsity           : 97%
## Maximal term length: 19
## Weighting          : term frequency (tf)
# fit a four-topic LDA model; the fixed seed makes the fit reproducible
chapters_lda <- LDA(chapters_dtm, k = 4, control = list(seed = 1234))
chapters_lda
## A LDA_VEM topic model with 4 topics.
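
Here k = 4 because we know four books went into the pile. When the number of groups is not known in advance, one rough approach is to refit at several candidate values of k and compare perplexity (lower is better). A minimal sketch, with purely illustrative candidate values; refitting at every k can be slow:

# compare training perplexity across candidate numbers of topics
tibble(k = c(2, 4, 8, 16)) %>%
  mutate(perplexity = map_dbl(k, ~ perplexity(
    LDA(chapters_dtm, k = .x, control = list(seed = 1234))
  )))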

Analyse probabilities

# per-topic-per-word probabilities
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics
## # A tibble: 72,860 × 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 joe     2.44e-23
##  2     2 joe     8.88e-68
##  3     3 joe     1.40e-26
##  4     4 joe     1.42e- 2
##  5     1 biddy   1.29e-32
##  6     2 biddy   7.63e-81
##  7     3 biddy   3.25e-52
##  8     4 biddy   4.69e- 3
##  9     1 estella 9.28e- 8
## 10     2 estella 1.93e-76
## # ℹ 72,850 more rows
# top 5 terms within each topic
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
## # A tibble: 20 × 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 elizabeth 0.0144 
##  2     1 darcy     0.00901
##  3     1 miss      0.00874
##  4     1 bennet    0.00710
##  5     1 jane      0.00665
##  6     2 captain   0.0155 
##  7     2 nautilus  0.0131 
##  8     2 sea       0.00885
##  9     2 nemo      0.00872
## 10     2 ned       0.00804
## 11     3 people    0.00678
## 12     3 martians  0.00644
## 13     3 time      0.00535
## 14     3 black     0.00526
## 15     3 night     0.00450
## 16     4 joe       0.0142 
## 17     4 time      0.00682
## 18     4 pip       0.00670
## 19     4 looked    0.00632
## 20     4 miss      0.00626
# visualize top terms
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
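
Beyond each topic's top terms, it can help to contrast two topics head to head. A minimal sketch comparing topics 1 and 4 (the apparent Pride and Prejudice and Great Expectations topics): spread beta into one column per topic, keep words that are reasonably common in at least one of the two, and rank by the log2 ratio.

# which words most sharply separate topic 4 from topic 1?
chapter_topics %>%
  filter(topic %in% c(1, 4)) %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>%
  filter(topic1 > 0.001 | topic4 > 0.001) %>%
  mutate(log_ratio = log2(topic4 / topic1)) %>%
  arrange(desc(abs(log_ratio)))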

# per-document-per-topic probabilities
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma
## # A tibble: 772 × 3
##    document                 topic      gamma
##    <chr>                    <int>      <dbl>
##  1 Great Expectations_57        1 0.0000116 
##  2 Great Expectations_7         1 0.0000126 
##  3 Great Expectations_17        1 0.0000182 
##  4 Great Expectations_27        1 0.0000165 
##  5 Great Expectations_38        1 0.341     
##  6 Great Expectations_2         1 0.0000148 
##  7 Great Expectations_23        1 0.531     
##  8 Great Expectations_15        1 0.0000123 
##  9 Great Expectations_18        1 0.0000109 
## 10 The War of the Worlds_16     1 0.00000931
## # ℹ 762 more rows
# separate title and chapter
chapters_gamma <- chapters_gamma %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)

chapters_gamma
## # A tibble: 772 × 4
##    title                 chapter topic      gamma
##    <chr>                   <int> <int>      <dbl>
##  1 Great Expectations         57     1 0.0000116 
##  2 Great Expectations          7     1 0.0000126 
##  3 Great Expectations         17     1 0.0000182 
##  4 Great Expectations         27     1 0.0000165 
##  5 Great Expectations         38     1 0.341     
##  6 Great Expectations          2     1 0.0000148 
##  7 Great Expectations         23     1 0.531     
##  8 Great Expectations         15     1 0.0000123 
##  9 Great Expectations         18     1 0.0000109 
## 10 The War of the Worlds      16     1 0.00000931
## # ℹ 762 more rows
# visualize the per-document-per-topic probability
chapters_gamma %>%
  mutate(title = reorder(title, gamma * topic)) %>% # order facets by dominant topic
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ title)
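
The boxplots summarize each title as a whole. To spot individual outlying chapters, a small follow-up sketch: keep only each chapter's largest gamma and plot it, so chapters well below 1 stand out as ambiguous.

# each chapter's top topic probability; low points are ambiguous chapters
chapters_gamma %>%
  group_by(title, chapter) %>%
  top_n(1, gamma) %>%
  ungroup() %>%
  ggplot(aes(chapter, gamma)) +
  geom_point() +
  facet_wrap(~ title)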

We notice that almost all chapters from Pride and Prejudice, The War of the Worlds, and Twenty Thousand Leagues under the Sea were cleanly identified with a single topic each, while some chapters from Great Expectations (which should be topic 4) show noticeable association with other topics.

Misclassification

Are there any cases where the topic most associated with a chapter belonged to another book?

chapter_classifications <- chapters_gamma %>%
  group_by(title, chapter) %>%
  top_n(1, gamma) %>%
  ungroup()

chapter_classifications
## # A tibble: 193 × 4
##    title               chapter topic gamma
##    <chr>                 <int> <int> <dbl>
##  1 Great Expectations       23     1 0.531
##  2 Pride and Prejudice      43     1 1.00 
##  3 Pride and Prejudice      18     1 1.00 
##  4 Pride and Prejudice      45     1 1.00 
##  5 Pride and Prejudice      16     1 1.00 
##  6 Pride and Prejudice      29     1 1.00 
##  7 Pride and Prejudice      10     1 1.00 
##  8 Pride and Prejudice       8     1 1.00 
##  9 Pride and Prejudice      56     1 1.00 
## 10 Pride and Prejudice      47     1 1.00 
## # ℹ 183 more rows
# get consensus topic for each book (the most common topic among its chapters)
book_topics <- chapter_classifications %>%
  count(title, topic) %>%
  group_by(title) %>%
  top_n(1, n) %>%
  ungroup() %>%
  transmute(consensus = title, topic)

chapter_classifications %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)
## # A tibble: 2 × 5
##   title              chapter topic gamma consensus            
##   <chr>                <int> <int> <dbl> <chr>                
## 1 Great Expectations      23     1 0.531 Pride and Prejudice  
## 2 Great Expectations      54     3 0.507 The War of the Worlds
chapters_gamma %>% 
  filter(title == "Great Expectations", chapter == 23)
## # A tibble: 4 × 4
##   title              chapter topic     gamma
##   <chr>                <int> <int>     <dbl>
## 1 Great Expectations      23     1 0.531    
## 2 Great Expectations      23     2 0.0000158
## 3 Great Expectations      23     3 0.0000158
## 4 Great Expectations      23     4 0.469
chapters_gamma %>% 
  filter(title == "Great Expectations", chapter == 54)
## # A tibble: 4 × 4
##   title              chapter topic      gamma
##   <chr>                <int> <int>      <dbl>
## 1 Great Expectations      54     1 0.00000916
## 2 Great Expectations      54     2 0.0387    
## 3 Great Expectations      54     3 0.507     
## 4 Great Expectations      54     4 0.454

We see that only two chapters from Great Expectations were misclassified: LDA assigned one to the Pride and Prejudice topic (topic 1) and the other to The War of the Worlds topic (topic 3). That's not bad for unsupervised clustering!
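
To see why those chapters drift, one could drill down to the word level with tidytext's augment(), which attaches each word's assigned topic (.topic) to the document-word counts. A sketch for the second misclassified chapter:

# word-level topic assignments from the fitted model
assignments <- augment(chapters_lda, data = chapters_dtm)

# words in Great Expectations chapter 54 that were pulled
# toward The War of the Worlds topic (topic 3)
assignments %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  filter(title == "Great Expectations", chapter == 54, .topic == 3) %>%
  arrange(desc(count))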