Dictionary-Based Text Analysis

Chris Bail
Duke University

Word Counting

Word Counting w/Trump Tweets

 

# Packages for tidy-text tokenization and data manipulation
library(tidytext)
library(dplyr)

# Loads a `trumptweets` data frame into the session from the course site
load(url("https://cbail.github.io/Trump_Tweets.Rdata"))

# One row per (tweet timestamp, word token): keep timestamp + text,
# then split each tweet into lowercase word tokens
tidy_trump_tweets <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text)

Without Stop Words

# Built-in lexicon of common English stop words (from tidytext)
data("stop_words")

# Non-substantive tokens left over from tweet markup and URLs
junk_tokens <- c("https", "rt", "t.co", "amp")

# Word frequencies after removing stop words and tweet artifacts,
# sorted from most to least frequent
top_words <-
  tidy_trump_tweets %>%
  # `by = "word"` makes the join key explicit and silences the message
  anti_join(stop_words, by = "word") %>%
  # %in% replaces the original chain of `word == "..." | ...` comparisons
  filter(!word %in% junk_tokens) %>%
  # count(sort = TRUE) is equivalent to count() %>% arrange(desc(n))
  count(word, sort = TRUE)

Plot

library(ggplot2)

# Bar chart of the 20 most frequent (non-stop) words in the corpus
top_words %>%
  slice(1:20) %>%
  # reorder(word, -n) sorts bars from most to least frequent
  ggplot(aes(x = reorder(word, -n), y = n, fill = word)) +
  # geom_col() is the modern equivalent of geom_bar(stat = "identity")
  geom_col() +
  theme_minimal() +
  # One theme() call instead of two separate ones
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1, size = 13),
    plot.title  = element_text(hjust = 0.5, size = 18)
  ) +
  ylab("Frequency") +
  xlab("") +
  ggtitle("Most Frequent Words in Trump Tweets") +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4; use "none"
  guides(fill = "none")

Plot

  (Figure: bar chart of the 20 most frequent words in Trump tweets, produced by the code above.)

Term Frequency Inverse Document Frequency

Term Frequency Inverse Document Frequency

tf-idf

# tf-idf scores, treating each tweet (keyed by its timestamp) as a "document":
# tokenize, drop stop words, count each word per tweet, then score
tidy_trump_tfidf <- trumptweets %>%
  select(created_at, text) %>%
  unnest_tokens("word", text) %>%
  # Explicit join key silences dplyr's implicit-join message
  anti_join(stop_words, by = "word") %>%
  # n = occurrences of `word` within the tweet posted at `created_at`
  count(word, created_at) %>%
  bind_tf_idf(word, created_at, n)

tf-idf

# Rank terms by descending tf-idf: the top entry is the word most
# distinctive of a single tweet relative to the rest of the corpus
top_tfidf <- tidy_trump_tfidf %>%
  arrange(desc(tf_idf))

# Highest-scoring term
top_tfidf$word[1]
[1] "standforouranthem"

tf-idf