library(janeaustenr)
library(dplyr)      # group_by(), mutate(), count(), anti_join(), ...
library(stringr)
library(tidytext)   # unnest_tokens(), stop_words
library(ggplot2)
# one line of text per row
austen_books()
## # A tibble: 73,422 × 2
##    text                    book
##  * <chr>                   <fct>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility
##  2 ""                      Sense & Sensibility
##  3 "by Jane Austen"        Sense & Sensibility
##  4 ""                      Sense & Sensibility
##  5 "(1811)"                Sense & Sensibility
##  6 ""                      Sense & Sensibility
##  7 ""                      Sense & Sensibility
##  8 ""                      Sense & Sensibility
##  9 ""                      Sense & Sensibility
## 10 "CHAPTER 1"             Sense & Sensibility
## # ℹ 73,412 more rows
# add line and chapter numbers within each book
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         # a line starting "chapter" + digit/Roman numeral opens a new chapter;
         # the running sum of matches is the chapter number (0 = front matter)
         chapter = cumsum(
           str_detect(text, regex("^chapter [\\divxlc]",
                                  ignore_case = TRUE)))) %>%
  ungroup()
original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows
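# a quick sanity check (illustrative, not part of the original notes):
# the pattern matches headings numbered with Arabic digits or Roman numerals
str_detect(c("CHAPTER 1", "Chapter XII", "in this chapter"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))
# expected: TRUE TRUE FALSE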
# tokenize: one word per row
tidy_books <- original_books %>%
  unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word
##    <fct>                    <int>   <int> <chr>
##  1 Sense & Sensibility          1       0 sense
##  2 Sense & Sensibility          1       0 and
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by
##  5 Sense & Sensibility          3       0 jane
##  6 Sense & Sensibility          3       0 austen
##  7 Sense & Sensibility          5       0 1811
##  8 Sense & Sensibility         10       1 chapter
##  9 Sense & Sensibility         10       1 1
## 10 Sense & Sensibility         13       1 the
## # ℹ 725,045 more rows
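# an aside (illustrative, not from the original notes): unnest_tokens()
# lowercases tokens and strips punctuation by default, which is why
# "SENSE AND SENSIBILITY" became sense, and, sensibility above
tibble(text = "A Truth Universally Acknowledged!") %>%
  unnest_tokens(word, text)
# yields one row per word: a, truth, universally, acknowledged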
# remove stop words
stop_words
## # A tibble: 1,149 × 2
##    word        lexicon
##    <chr>       <chr>
##  1 a           SMART
##  2 a's         SMART
##  3 able        SMART
##  4 about       SMART
##  5 above       SMART
##  6 according   SMART
##  7 accordingly SMART
##  8 across      SMART
##  9 actually    SMART
## 10 after       SMART
## # ℹ 1,139 more rows
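# stop_words pools several stop-word lists; to see which lexicons it
# combines (a quick look, not in the original notes):
stop_words %>%
  count(lexicon)
# lexicons: "onix", "SMART", "snowball"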
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")  # drop rows whose word is a stop word
# word frequency
tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows
# plot word frequency
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  # reorder the levels of the factor word by n so the bars sort by frequency
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()
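
# the same plot without coord_flip() (a sketch, assuming ggplot2 >= 3.3.0,
# where a discrete variable can be mapped straight to the y axis):
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL)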

library(wordcloud)
tidy_books %>%
  count(word) %>%
  # with() evaluates an R expression in an environment constructed from the data
  with(wordcloud(word, n, max.words = 100))
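
# usage note (not part of the original): wordcloud() places words with some
# randomness, so layouts differ between runs; set the RNG seed first for a
# reproducible cloud
set.seed(1234)
tidy_books %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))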

# Porter word stemming
library(SnowballC)
tidy_books <- tidy_books %>%
  mutate(word = wordStem(word))  # reduce each word to its stem
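
# what the stemmer does to a few inflected forms (illustrative, not from
# the original notes); note that stems need not be real words:
wordStem(c("sister", "sisters", "family", "families", "misses"))
# "sister" "sister" "famili" "famili" "miss"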
# plot stem frequency
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

# tokenize by pattern (regular expression): one chapter per row
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text,
                token = "regex",
                pattern = "(Chapter|CHAPTER) [\\dIVXLC]{1,8}") %>%
  ungroup()
# how many chapters in each book?
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n()) %>%
  arrange(-chapters)
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Pride & Prejudice         62
## 2 Emma                      56
## 3 Sense & Sensibility       51
## 4 Mansfield Park            49
## 5 Northanger Abbey          32
## 6 Persuasion                25
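
# each count is one higher than the novel's actual chapter total: with
# token = "regex" the text before the first chapter heading becomes its
# own chunk. To inspect those front-matter chunks (a quick check, not in
# the original notes):
austen_chapters %>%
  group_by(book) %>%
  slice(1) %>%
  ungroup()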