Chris Bail
Duke University
website: https://www.chrisbail.net
Twitter: https://www.twitter.com/chris_bail
github: https://github.com/cbail
library(topicmodels)
library(tm)
# Load the AssociatedPress document-term matrix and print its top-left corner:
# the first five documents against the first five terms.
data("AssociatedPress")
inspect(AssociatedPress[seq_len(5), seq_len(5)])
<<DocumentTermMatrix (documents: 5, terms: 5)>>
Non-/sparse entries: 0/25
Sparsity : 100%
Maximal term length: 10
Weighting : term frequency (tf)
Sample :
Terms
Docs aaron abandon abandoned abandoning abbott
[1,] 0 0 0 0 0
[2,] 0 0 0 0 0
[3,] 0 0 0 0 0
[4,] 0 0 0 0 0
[5,] 0 0 0 0 0
# Fit an LDA topic model with k = 10 topics to the AssociatedPress matrix.
# The seed is pinned in `control` so that re-running the script reproduces
# the same fitted model.
AP_topic_model <- LDA(
  x = AssociatedPress,
  k = 10,
  control = list(seed = 321)
)
library(tidytext)
library(dplyr)
# Convert the fitted model's "beta" matrix into a tidy table; the steps below
# rely on its `topic`, `term` and `beta` columns.
AP_topics <- tidy(AP_topic_model, matrix = "beta")

# Keep the ten highest-beta terms of every topic, then sort by topic and by
# decreasing beta within each topic.
# slice_max() replaces the superseded top_n(); with_ties = TRUE keeps the
# same tie behaviour (tied rows at the cut-off are all retained).
ap_top_terms <- AP_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10, with_ties = TRUE) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
library(ggplot2)
# Draw the terms in ap_top_terms as horizontal bars of beta, one facet per
# topic.
ap_top_terms %>%
  mutate(
    # Build the facet label as a factor whose levels follow the numeric topic
    # id. Plain character labels sort alphabetically ("Topic # 1",
    # "Topic # 10", "Topic # 2", ...), which scrambles the facet order.
    topic = factor(
      paste("Topic #", topic),
      levels = paste("Topic #", sort(unique(topic)))
    ),
    # Order terms by beta separately inside each topic. A single global
    # reorder(term, beta) misplaces any term that occurs in several topics,
    # because it gets one position shared by all facets.
    term = reorder_within(term, beta, topic)
  ) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  # Strip the per-topic suffix that reorder_within() appended to each term so
  # the axis shows the bare term again.
  scale_x_reordered() +
  theme_minimal() +
  # Centre and enlarge the plot title.
  theme(plot.title = element_text(hjust = 0.5, size = 18)) +
  labs(
    title = "Topic Model of AP News Articles",
    caption = "Top Terms by Topic (betas)",
    x = "",
    y = ""
  ) +
  # Flip the axes so the terms read down the side and the bars run
  # horizontally.
  coord_flip()