We’ll explore some of the methods tidytext offers for calculating and visualizing relationships between words in your text dataset.
An n-gram is a consecutive sequence of n words. By seeing how often word X is followed by word Y, we can then build a model of the relationships between them.
# Packages for data wrangling, tidy text tokenization, and the Austen corpus.
library(tidyverse)
library(tidytext)
library(janeaustenr)

# Tokenize each novel into bigrams (pairs of consecutive words).
# unnest_tokens() yields NA for lines too short to form a bigram, so drop those.
austen_bigrams <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

austen_bigrams
## # A tibble: 662,783 × 2 ## book bigram ## <fct> <chr> ## 1 Sense & Sensibility sense and ## 2 Sense & Sensibility and sensibility ## 3 Sense & Sensibility by jane ## 4 Sense & Sensibility jane austen ## 5 Sense & Sensibility chapter 1 ## 6 Sense & Sensibility the family ## 7 Sense & Sensibility family of ## 8 Sense & Sensibility of dashwood ## 9 Sense & Sensibility dashwood had ## 10 Sense & Sensibility had long ## # … with 662,773 more rows
# Most frequent bigrams across all six novels (stop words still included,
# so common function-word pairs like "of the" dominate).
austen_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 193,209 × 2 ## bigram n ## <chr> <int> ## 1 of the 2853 ## 2 to be 2670 ## 3 in the 2221 ## 4 it was 1691 ## 5 i am 1485 ## 6 she had 1405 ## 7 of her 1363 ## 8 to the 1315 ## 9 she was 1309 ## 10 had been 1206 ## # … with 193,199 more rows
# Split each bigram into its two component words so that stop words can be
# screened out of either position.
bigrams_separated <- austen_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stop word.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)

# new bigram counts
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 28,974 × 3 ## word1 word2 n ## <chr> <chr> <int> ## 1 sir thomas 266 ## 2 miss crawford 196 ## 3 captain wentworth 143 ## 4 miss woodhouse 143 ## 5 frank churchill 114 ## 6 lady russell 110 ## 7 sir walter 108 ## 8 lady bertram 101 ## 9 miss fairfax 98 ## 10 colonel brandon 96 ## # … with 28,964 more rows
# Recombine the filtered word pairs into a single space-separated bigram
# column, inverting the earlier separate() step.
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united
## # A tibble: 38,913 × 2 ## book bigram ## <fct> <chr> ## 1 Sense & Sensibility jane austen ## 2 Sense & Sensibility chapter 1 ## 3 Sense & Sensibility norland park ## 4 Sense & Sensibility surrounding acquaintance ## 5 Sense & Sensibility late owner ## 6 Sense & Sensibility advanced age ## 7 Sense & Sensibility constant companion ## 8 Sense & Sensibility happened ten ## 9 Sense & Sensibility henry dashwood ## 10 Sense & Sensibility norland estate ## # … with 38,903 more rows
# The same pipeline generalizes to trigrams: tokenize with n = 3, split into
# three word columns, drop any trigram containing a stop word, then count.
austen_books() %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 6,140 × 4 ## word1 word2 word3 n ## <chr> <chr> <chr> <int> ## 1 dear miss woodhouse 20 ## 2 miss de bourgh 17 ## 3 lady catherine de 11 ## 4 poor miss taylor 11 ## 5 sir walter elliot 10 ## 6 catherine de bourgh 9 ## 7 dear sir thomas 8 ## 8 replied miss crawford 7 ## 9 sir william lucas 7 ## 10 ten thousand pounds 7 ## # … with 6,130 more rows