source('create_datasets.R')
## Need this line so qdap will load correctly (with java) when using knitr button.
dyn.load('/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
library(qdap)
library(dplyr)
library(tm)
library(wordcloud)
library(plotrix)
library(dendextend)
library(ggplot2)
library(ggthemes)
library(RWeka)
The simple definition: text mining is the process of distilling actionable insights from text.
Workflow: define the problem, identify the text to collect, organize the text, extract features, perform the analysis, and reach an insight or recommendation.
Two types of text mining: semantic parsing, which cares about word type and order, and bag of words, which treats each word as a token and only counts matter. This analysis takes the bag-of-words approach.
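A minimal bag-of-words sketch (illustration only, not part of the course code): each word is a token and only the counts matter.
# Tokenize a sentence on spaces and tally word counts, ignoring grammar and order
txt <- "Coffee is great and coffee is hot"
table(strsplit(tolower(txt), " ")[[1]])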
# qdap is loaded
# Print new_text to the console
new_text <- "DataCamp is the first online learning platform that focuses on building the best learning experience specifically for Data Science. We have offices in Boston and Belgium and to date, we trained over 250,000 (aspiring) data scientists in over 150 countries. These data science enthusiasts completed more than 9 million exercises. You can take free beginner courses, or subscribe for $25/month to get access to all premium courses."
# Find the 10 most frequent terms: term_count
term_count <- freq_terms(new_text, 10)
# Plot term_count
plot(term_count)
# Import text data
tweets <- read.csv('https://assets.datacamp.com/production/course_935/datasets/coffee.csv', stringsAsFactors = F)
# View the structure of tweets
glimpse(tweets)
## Observations: 1,000
## Variables: 15
## $ num <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15...
## $ text <chr> "@ayyytylerb that is so true drink lots of coffee...
## $ favorited <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ replyToSN <chr> "ayyytylerb", NA, NA, NA, NA, NA, NA, "dreamwwedi...
## $ created <chr> "8/9/2013 2:43", "8/9/2013 2:43", "8/9/2013 2:43"...
## $ truncated <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ replyToSID <dbl> 3.65664e+17, NA, NA, NA, NA, NA, NA, 3.65664e+17,...
## $ id <dbl> 3.65665e+17, 3.65665e+17, 3.65665e+17, 3.65665e+1...
## $ replyToUID <int> 1637123977, NA, NA, NA, NA, NA, NA, 1316942208, N...
## $ statusSource <chr> "<a href=\"http://twitter.com/download/iphone\" r...
## $ screenName <chr> "thejennagibson", "carolynicosia", "janeCkay", "A...
## $ retweetCount <int> 0, 1, 0, 0, 2, 0, 0, 0, 1, 2, 0, 6, 0, 6, 0, 42, ...
## $ retweeted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ longitude <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ latitude <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
# Print out the number of rows in tweets
nrow(tweets)
## [1] 1000
# Isolate text from tweets: coffee_tweets
coffee_tweets <- tweets$text
head(coffee_tweets)
## [1] "@ayyytylerb that is so true drink lots of coffee"
## [2] "RT @bryzy_brib: Senior March tmw morning at 7:25 A.M. in the SENIOR lot. Get up early, make yo coffee/breakfast, cus this will only happen ?"
## [3] "If you believe in #gunsense tomorrow would be a very good day to have your coffee any place BUT @Starbucks Guns+Coffee=#nosense @MomsDemand"
## [4] "My cute coffee mug. http://t.co/2udvMU6XIG"
## [5] "RT @slaredo21: I wish we had Starbucks here... Cause coffee dates in the morning sound perff!"
## [6] "Does anyone ever get a cup of coffee before a cocktail??"
# the tm library is loaded
# Make a vector source: coffee_source
coffee_source <- VectorSource(coffee_tweets)
# Make a volatile corpus: coffee_corpus
coffee_corpus <- VCorpus(coffee_source)
# Print out coffee_corpus
coffee_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
# Print data on the 15th tweet in coffee_corpus
coffee_corpus[[15]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 111
# Print the content of the 15th tweet in coffee_corpus
coffee_corpus[[15]]$content
## [1] "@HeatherWhaley I was about 2 joke it takes 2 hands to hold hot coffee...then I read headline! #Don'tDrinkNShoot"
# Print example_text to the console
example_text <- structure(list(num = 1:3, Author1 = c("Text mining is a great time.",
"Text analysis provides insights", "qdap and tm are used in text mining"
), Author2 = c("R is a great language", "R has many uses", "DataCamp is cool!"
)), .Names = c("num", "Author1", "Author2"), row.names = c(NA,
-3L), class = "data.frame")
example_text
## num Author1 Author2
## 1 1 Text mining is a great time. R is a great language
## 2 2 Text analysis provides insights R has many uses
## 3 3 qdap and tm are used in text mining DataCamp is cool!
# Create a DataframeSource on columns 2 and 3: df_source
df_source <- DataframeSource(example_text[,2:3])
df_source
## $encoding
## [1] ""
##
## $length
## [1] 3
##
## $position
## [1] 0
##
## $reader
## function (elem, language, id)
## {
## if (!is.null(elem$uri))
## id <- basename(elem$uri)
## PlainTextDocument(elem$content, id = id, language = language)
## }
## <environment: namespace:tm>
##
## $content
## Author1 Author2
## 1 Text mining is a great time. R is a great language
## 2 Text analysis provides insights R has many uses
## 3 qdap and tm are used in text mining DataCamp is cool!
##
## attr(,"class")
## [1] "DataframeSource" "SimpleSource" "Source"
# Convert df_source to a corpus: df_corpus
df_corpus <- VCorpus(df_source)
# Examine df_corpus
df_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
df_corpus[[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 52
df_corpus[[3]]$content
## [1] "qdap and tm are used in text mining"
## [2] "DataCamp is cool!"
# Create a VectorSource on column 3: vec_source
vec_source <- VectorSource(example_text[,3])
# Convert vec_source to a corpus: vec_corpus
vec_corpus <- VCorpus(vec_source)
# Examine vec_corpus
vec_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
vec_corpus[[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 17
vec_corpus[[3]]$content
## [1] "DataCamp is cool!"
Common preprocessing functions and uses:
From base R:
tolower
From the tm package:
removePunctuation
removeNumbers
stripWhitespace
removeWords
Other useful tm functions: tm_map takes a corpus and a processing function and transforms the corpus. The content_transformer() and stemDocument() functions can also be very helpful.
From the qdap package:
bracketX(): remove all text within brackets (e.g. “It’s (so) cool” becomes “It’s cool”)
replace_number(): replace numbers with their word equivalents (e.g. “2” becomes “two”)
replace_abbreviation(): replace abbreviations with their full text equivalents (e.g. “Sr” becomes “Senior”)
replace_contraction(): convert contractions back to their base words (e.g. “shouldn’t” becomes “should not”)
replace_symbol(): replace common symbols with their word equivalents (e.g. “$” becomes “dollar”)
# Create the object: text
text <- "<b>She</b> woke up at 6 A.M. It\'s so early! She was only 10% awake and began drinking coffee in front of her computer."
# All lowercase
tolower(text)
## [1] "<b>she</b> woke up at 6 a.m. it's so early! she was only 10% awake and began drinking coffee in front of her computer."
# Remove punctuation
removePunctuation(text)
## [1] "bSheb woke up at 6 AM Its so early She was only 10 awake and began drinking coffee in front of her computer"
# Remove numbers
removeNumbers(text)
## [1] "<b>She</b> woke up at A.M. It's so early! She was only % awake and began drinking coffee in front of her computer."
# Remove whitespace
stripWhitespace(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
## text is still loaded in your workspace
text
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Remove text within brackets
bracketX(text)
## [1] "She woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace numbers with words
replace_number(text)
## [1] "<b>She</b> woke up at six A.M. It's so early! She was only ten% awake and began drinking coffee in front of her computer."
# Replace abbreviations
replace_abbreviation(text)
## [1] "<b>She</b> woke up at 6 AM It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace contractions
replace_contraction(text)
## [1] "<b>She</b> woke up at 6 A.M. it is so early! She was only 10% awake and began drinking coffee in front of her computer."
# Replace symbols with words
replace_symbol(text)
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10 percent awake and began drinking coffee in front of her computer."
## text is preloaded into your workspace
text
## [1] "<b>She</b> woke up at 6 A.M. It's so early! She was only 10% awake and began drinking coffee in front of her computer."
# List standard English stop words
stopwords("en")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
# Print text without standard stop words
removeWords(text, stopwords('en'))
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking coffee front computer."
# Add "coffee" and "bean" to the list: new_stops
new_stops <- c("coffee", "bean", stopwords("en"))
# Remove stop words from text
removeWords(text, new_stops)
## [1] "<b>She</b> woke 6 A.M. It's early! She 10% awake began drinking front computer."
The SnowballC package is required for the stemDocument function to work.
# Create complicate
complicate <- c("complicated", "complication", "complicatedly")
# Perform word stemming: stem_doc
stem_doc <- stemDocument(complicate)
# Create the completion dictionary: comp_dict
comp_dict <- c("complicate")
# Perform stem completion: complete_text
complete_text <- stemCompletion(stem_doc, comp_dict)
# Print complete_text
complete_text
## complic complic complic
## "complicate" "complicate" "complicate"
stemDocument applied to a raw sentence does not stem words with punctuation attached (note “complication,” and “complicatedly.” below):
text_data <- "In a complicated haste, Tom rushed to fix a new complication, too complicatedly."
stemDocument(text_data)
## [1] "In a complic haste, Tom rush to fix a new complication, too complicatedly."
Here is the correct way to stem a sentence:
# Remove punctuation: rm_punc
rm_punc <- removePunctuation(text_data)
# Create character vector: n_char_vec
n_char_vec <- unlist(strsplit(rm_punc, split = ' '))
# Perform word stemming: stem_doc
stem_doc <- stemDocument(n_char_vec)
# Print stem_doc
stem_doc
## [1] "In" "a" "complic" "hast" "Tom" "rush" "to"
## [8] "fix" "a" "new" "complic" "too" "complic"
# Re-complete stemmed document: complete_doc
comp_dict
## [1] "complicate"
complete_doc <- stemCompletion(stem_doc, comp_dict)
# Print complete_doc
complete_doc
## In a complic hast Tom
## "" "" "complicate" "" ""
## rush to fix a new
## "" "" "" "" ""
## complic too complic
## "complicate" "" "complicate"
The tm_map function is used to apply a processing function to a corpus. tm package functions do not need content_transformer(), but base R and qdap functions do.
# Alter the function code to match the instructions
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "coffee", "mug"))
return(corpus)
}
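For example (a sketch, assuming qdap's bracketX() is attached), only non-tm functions need wrapping:
# tm functions pass straight to tm_map; base R and qdap functions need content_transformer()
corpus_tmp <- tm_map(coffee_corpus, removeNumbers)              # tm function: no wrapper
corpus_tmp <- tm_map(corpus_tmp, content_transformer(bracketX)) # qdap function: wrapped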
# Apply the customized function to coffee_corpus: clean_corp (tweet_corp below holds the original tweets for comparison)
tweet_corp
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 20
coffee_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
clean_corp <- clean_corpus(coffee_corpus)
# Print out a cleaned up tweet
clean_corp[[20]][1]
## $content
## [1] " wonder christian colon will get cup rosters expand 40 man september really nothing lose "
# Print out the same tweet in original form
tweet_corp[[20]][1]
## $content
## [1] "I wonder if Christian Colon will get a cup of coffee once the rosters expand to 40 man in September. Really nothing to lose by doing so."
# Create the dtm from the corpus: coffee_dtm
clean_corp
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1000
coffee_dtm <- DocumentTermMatrix(clean_corp)
# Print out coffee_dtm data
coffee_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3075)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_dtm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_dtm)
# Print the dimensions of coffee_m
dim(coffee_m)
## [1] 1000 3075
# Review a portion of the matrix
coffee_m[14:16, 100:105]
## Terms
## Docs alot already alright also always alwayzremember
## 14 0 0 0 0 0 0
## 15 0 0 0 0 0 0
## 16 0 0 0 0 0 0
# Create a TDM from clean_corp: coffee_tdm
coffee_tdm <- TermDocumentMatrix(clean_corp)
# Print coffee_tdm data
coffee_tdm
## <<TermDocumentMatrix (terms: 3075, documents: 1000)>>
## Non-/sparse entries: 7384/3067616
## Sparsity : 100%
## Maximal term length: 27
## Weighting : term frequency (tf)
# Convert coffee_tdm to a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Print the dimensions of the matrix
dim(coffee_m)
## [1] 3075 1000
# Review a portion of the matrix
coffee_m[ 100:105, 14:16]
## Docs
## Terms 14 15 16
## alot 0 0 0
## already 0 0 0
## alright 0 0 0
## also 0 0 0
## always 0 0 0
## alwayzremember 0 0 0
# Create a matrix: coffee_m
coffee_m <- as.matrix(coffee_tdm)
# Calculate the rowSums: term_frequency
term_frequency <- rowSums(coffee_m)
# Sort term_frequency in descending order
term_frequency <- sort(term_frequency, decreasing = T)
# View the top 10 most common words
term_frequency[1:10]
## like cup shop just get morning want drinking
## 111 103 69 66 62 57 49 47
## can looks
## 45 45
# Plot a barchart of the 10 most common words
barplot(term_frequency[1:10], col = "tan", las = 2)
head(tweets$text)
## [1] "@ayyytylerb that is so true drink lots of coffee"
## [2] "RT @bryzy_brib: Senior March tmw morning at 7:25 A.M. in the SENIOR lot. Get up early, make yo coffee/breakfast, cus this will only happen ?"
## [3] "If you believe in #gunsense tomorrow would be a very good day to have your coffee any place BUT @Starbucks Guns+Coffee=#nosense @MomsDemand"
## [4] "My cute coffee mug. http://t.co/2udvMU6XIG"
## [5] "RT @slaredo21: I wish we had Starbucks here... Cause coffee dates in the morning sound perff!"
## [6] "Does anyone ever get a cup of coffee before a cocktail??"
head(Top200Words)
## [1] "the" "of" "and" "a" "to" "in"
# Create frequency
frequency <- freq_terms(tweets$text,
top = 10,
stopwords = c(Top200Words, "coffee"),
at.least = 3
)
# Make a frequency barchart
plot(frequency)
# Create frequency2
frequency2 <- freq_terms(tweets$text,
top = 10,
at.least = 3,
stopwords = c(tm::stopwords("en"), "coffee")
)
# Make a frequency2 barchart
plot(frequency2)
# Import chardonnay tweet data
chardonnay_tweets <- read.csv('https://assets.datacamp.com/production/course_935/datasets/chardonnay.csv', stringsAsFactors = F)
head(chardonnay_tweets$text)
## [1] "RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay (smells like cat's pee) from so?"
## [2] "?@roystbaggage: 'Go to your Auntie Chardonnay and she will help you piss up against that wall' - the scum of Dover.?what's this even mean ha"
## [3] "Big thank you to Ian at Fowles wine for making me a Chardonnay drinker. @LadiesWhoShoot #wrongwayround http://t.co/KiA2StsOEO"
## [4] "RT @oceanclub: @eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay (smells like cat's pee) from so?"
## [5] "After to worst rain at night known, floating flip flops, we have woke to the hottest sun!! #pool #sunshine #Chardonnay #happyhols <ed><U+00A0><U+00BD><ed><U+00B8><U+008A><ed><U+00A0><U+00BC><ed><U+00BD><U+00B7><ed><U+00A0><U+00BC><ed><U+00BD><U+00BA><ed><U+00A0><U+00BD><ed><U+00B1><U+008B>"
## [6] "@eilisohanlon @stonyjim @vonprond Eilis, I'm from Pearse St and even I can tell a Chardonnay (smells like cat's pee) from something else."
# Make a vector source
chardonnay_source <- VectorSource(chardonnay_tweets$text)
# Make a volatile corpus
chardonnay_corpus <- VCorpus(chardonnay_source)
# Clean the corpus
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
return(corpus)
}
chardonnay_clean_corp <- clean_corpus(chardonnay_corpus)
# Convert TDM to matrix
chardonnay_tdm <- TermDocumentMatrix(chardonnay_clean_corp)
chardonnay_m <- as.matrix(chardonnay_tdm)
# Sum rows and frequency data frame
chardonnay_term_freq <- rowSums(chardonnay_m)
head(chardonnay_term_freq)
## 0nanist 100 1039thebear 1080btl 10km 10th
## 1 7 1 1 1 1
chardonnay_word_freqs <- data.frame(
term = names(chardonnay_term_freq),
num = chardonnay_term_freq
)
head(chardonnay_word_freqs)
## term num
## 0nanist 0nanist 1
## 100 100 7
## 1039thebear 1039thebear 1
## 1080btl 1080btl 1
## 10km 10km 1
## 10th 10th 1
# The wordcloud package is loaded
# Create a wordcloud for the values in word_freqs
wordcloud(chardonnay_word_freqs$term, chardonnay_word_freqs$num,
max.words = 100, colors = "red")
# Add new stop words to clean_corpus()
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "amp", "chardonnay", "wine", "glass"))
return(corpus)
}
# Create clean_chardonnay
chardonnay_clean_corp <- clean_corpus(chardonnay_corpus)
# Create chardonnay_tdm
chardonnay_tdm <- TermDocumentMatrix(chardonnay_clean_corp)
# Create chardonnay_m
chardonnay_m <- as.matrix(chardonnay_tdm)
# Create chardonnay_words
chardonnay_words <- rowSums(chardonnay_m)
# Sort the chardonnay_words in descending order
chardonnay_words <- sort(chardonnay_words, decreasing = T)
# Print the 6 most frequent chardonnay terms
head(chardonnay_words)
## marvin gaye just like bottle lol
## 104 76 75 55 47 43
# Create chardonnay_freqs
chardonnay_freqs <- data.frame(
term = names(chardonnay_words),
num = chardonnay_words
)
# Create a wordcloud for the values in word_freqs
wordcloud(chardonnay_freqs$term, chardonnay_freqs$num,
max.words = 50, colors = 'red')
# Print the list of colors
head(colors(),50)
## [1] "white" "aliceblue" "antiquewhite" "antiquewhite1"
## [5] "antiquewhite2" "antiquewhite3" "antiquewhite4" "aquamarine"
## [9] "aquamarine1" "aquamarine2" "aquamarine3" "aquamarine4"
## [13] "azure" "azure1" "azure2" "azure3"
## [17] "azure4" "beige" "bisque" "bisque1"
## [21] "bisque2" "bisque3" "bisque4" "black"
## [25] "blanchedalmond" "blue" "blue1" "blue2"
## [29] "blue3" "blue4" "blueviolet" "brown"
## [33] "brown1" "brown2" "brown3" "brown4"
## [37] "burlywood" "burlywood1" "burlywood2" "burlywood3"
## [41] "burlywood4" "cadetblue" "cadetblue1" "cadetblue2"
## [45] "cadetblue3" "cadetblue4" "chartreuse" "chartreuse1"
## [49] "chartreuse2" "chartreuse3"
# Print the wordcloud with the specified colors
wordcloud(chardonnay_freqs$term,
chardonnay_freqs$num,
max.words = 100,
colors = c("grey80","darkgoldenrod1", "tomato")
)
Here’s an example of building a palette with RColorBrewer’s brewer.pal() and dropping the faintest colors:
green_pal <- brewer.pal(8, "Greens")
munsell::plot_hex(green_pal)
green_pal <- green_pal[-(1:2)]
munsell::plot_hex(green_pal)
# List the available colors
display.brewer.all()
# Create purple_orange
purple_orange <- brewer.pal(10, "PuOr")
# Drop 2 faintest colors
## In this case the faintest colors are in the middle as it fades from orange to purple
munsell::plot_hex(purple_orange)
purple_orange <- purple_orange[-(5:6)]
# Create a wordcloud with purple_orange palette
wordcloud(chardonnay_freqs$term,
chardonnay_freqs$num,
max.words = 100,
colors = purple_orange)
# Combine both corpora: all_tweets
all_coffee <- paste(coffee_tweets, collapse = " ")
all_chardonnay <- paste(chardonnay_tweets$text, collapse = " ")
all_tweets <- c(all_coffee, all_chardonnay)
# clean all_tweets
all_tweets <- VectorSource(all_tweets)
all_corpus <- VCorpus(all_tweets)
# Add new stop words to clean_corpus()
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "amp", "chardonnay", "wine", "glass", "coffee"))
return(corpus)
}
all_clean <- clean_corpus(all_corpus)
all_tdm <- TermDocumentMatrix(all_clean)
all_m <- as.matrix(all_tdm)
# Make commonality cloud
commonality.cloud(all_m,
colors = "steelblue1",
max.words = 100)
Notice that shared terms such as chocolate show up in the commonality cloud.
# Clean the corpus
all_clean <- clean_corpus(all_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_clean)
# Give the columns distinct names
colnames(all_tdm) <- c("coffee", "chardonnay")
# Create all_m
all_m <- as.matrix(all_tdm)
# Create comparison cloud
comparison.cloud(all_m,
colors = c("orange", "blue"),
max.words = 50)
A limitation of commonality.cloud is that it does not show how the shared words are distributed proportionately between the corpora. pyramid.plot shows you both the common words and where they show up most.
# Identify terms shared by both documents
common_words <- subset(
all_m,
all_m[, 1] > 0 & all_m[, 2] > 0
)
head(common_words)
## Docs
## Terms coffee chardonnay
## actually 7 2
## aint 2 6
## airport 1 1
## almond 2 1
## almost 4 1
## alot 1 2
# Calculate the absolute difference between the columns and order by it
difference <- abs(common_words[, 1] - common_words[, 2])
common_words <- cbind(common_words, difference)
common_words <- common_words[order(common_words[, 3],
decreasing = T), ]
head(common_words)
## coffee chardonnay difference
## cup 98 1 97
## shop 63 2 61
## like 109 54 55
## morning 48 1 47
## bottle 3 46 43
## want 49 10 39
top25_df <- data.frame(x = common_words[1:25, 1],
y = common_words[1:25, 2],
labels = rownames(common_words[1:25, ]))
# The plotrix package has been loaded
# Make pyramid plot
pyramid.plot(top25_df$x, top25_df$y,
labels = top25_df$labels,
main = "Words in Common",
gap = 18,
laxlab = NULL,
raxlab = NULL,
unit = NULL,
top.labels = c("Coffee",
"Words",
"Chardonnay")
)
## [1] 5.1 4.1 4.1 2.1
# Create word network
word_associate(
coffee_tweets,
match.string = c("barista"),
stopwords = c(Top200Words, "coffee", "amp"),
network.plot = T,
cloud.colors = c("gray85", "darkred")
)
## row group unit text
## 1 544 all 544 RT @Barista_kyo: #coffee #latte #soylatte #thinkcoffee # # # # @ think coffee http://t.co/Hmy9RPRWTZ
## 2 569 all 569 RT @ReversoSmith: What a beautiful mess! #portafilter #coffee #espresso #coffeemachine #barista #baristalife? http://t.co/ZODcTfP22Z
## 3 658 all 658 The moment you realize your Starbucks barista gave you a regular iced Coffee when u asked 4 decaf. Shitty. Late night not planned.
## 4 931 all 931 Barista made my coffee wrong and still gave me both anyway #Starbucks #coffee #caffeine #upallnight http://t.co/iKCNwO8F6t
## 5 951 all 951 RT @FrankIero: hahaha @jamiasan :*gives Barista our Starbucks order* Barista: coffee? @jamiasan : yes, isn't this is a coffee store?
# Add title
title(main = "Barista Coffee Tweet Associations")
rain <- structure(list(city = structure(c(2L, 4L, 1L, 3L), .Label = c("Boston",
"Cleveland", "New Orleans", "Portland"), class = "factor"), rainfall = c(39.14,
39.14, 43.77, 62.45)), .Names = c("city", "rainfall"), row.names = c(NA,
-4L), class = "data.frame")
rain
## city rainfall
## 1 Cleveland 39.14
## 2 Portland 39.14
## 3 Boston 43.77
## 4 New Orleans 62.45
# Create dist_rain
dist_rain <- dist(rain[ ,2])
# View the distance matrix
dist_rain
## 1 2 3
## 2 0.00
## 3 4.63 4.63
## 4 23.31 23.31 18.68
# Create hc
hc <- hclust(dist_rain)
# Plot hc
plot(hc, labels = rain$city)
# Print the dimensions of chardonnay_tdm
dim(chardonnay_tdm)
## [1] 3063 1000
# Create tdm1
tdm1 <- removeSparseTerms(chardonnay_tdm, sparse = 0.95)
# Create tdm2
tdm2 <- removeSparseTerms(chardonnay_tdm, sparse = 0.975)
# Print tdm1
print(tdm1)
## <<TermDocumentMatrix (terms: 4, documents: 1000)>>
## Non-/sparse entries: 305/3695
## Sparsity : 92%
## Maximal term length: 6
## Weighting : term frequency (tf)
dim(tdm1)
## [1] 4 1000
# Print tdm2
print(tdm2)
## <<TermDocumentMatrix (terms: 10, documents: 1000)>>
## Non-/sparse entries: 519/9481
## Sparsity : 95%
## Maximal term length: 6
## Weighting : term frequency (tf)
dim(tdm2)
## [1] 10 1000
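As a side note, here is a sketch of the rule removeSparseTerms() applies: with sparse = 0.975 a term survives only if it appears in more than 2.5% of documents.
# Manual equivalent of removeSparseTerms(chardonnay_tdm, sparse = 0.975)
m <- as.matrix(chardonnay_tdm)
keep <- rowSums(m > 0) > ncol(m) * (1 - 0.975)
sum(keep)  # should match the 10 terms kept above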
Now compute a distance matrix with dist() and cluster the terms.
# Create chardonnay_tdm2
chardonnay_tdm2 <- removeSparseTerms(chardonnay_tdm, sparse = 0.975)
# Create tdm_m
tdm_m <- as.matrix(chardonnay_tdm2)
tdm_m[1:10, 1:20]
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## bottle 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dont 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gaye 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0
## get 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## just 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 1 0
## like 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## little 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
## lol 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## marvin 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0
## rose 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Create tdm_df
tdm_df <- as.data.frame(tdm_m)
head(tdm_df[,1:20])
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## bottle 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## dont 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## gaye 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0
## get 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## just 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 0 1 0
## like 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Create chardonnay_dist
chardonnay_dist <- dist(tdm_df)
head(chardonnay_dist)
## [1] 9.000000 11.090537 8.888194 10.862780 10.198039 9.165151
# Create hc
hc <- hclust(chardonnay_dist)
hc
##
## Call:
## hclust(d = chardonnay_dist)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 10
# Plot the dendrogram
plot(hc)
# dendextend library is loaded
# Create hcd
hcd <- as.dendrogram(hc)
# Print the labels in hcd
labels(hcd)
## [1] "gaye" "marvin" "just" "like" "lol" "bottle" "rose"
## [8] "little" "dont" "get"
# Change the branch color to red for "marvin" and "gaye"
hcd <- branches_attr_by_labels(hcd, c("marvin", "gaye"), color = "red")
# Plot hcd
plot(hcd)
# Add cluster rectangles
rect.dendrogram(hcd, k = 2, border = "grey50")
Word associations can be examined with the findAssocs() function in the tm package. For a given word, findAssocs() calculates its correlation with every other word in a TDM or DTM. To use findAssocs(), pass in a TDM or DTM, the search term, and a minimum correlation.
# Create associations
associations <- findAssocs(coffee_tdm, "venti", 0.2)
# View the venti associations
associations
## $venti
## breve drizzle entire pumps extra cuz forget
## 0.58 0.58 0.58 0.58 0.47 0.41 0.41
## okay hyper mocha vanilla wtf always asleep
## 0.41 0.33 0.33 0.33 0.29 0.26 0.26
## get starbucks white
## 0.25 0.25 0.23
# Create associations_df
associations_df <- list_vect2df(associations)[, 2:3]
head(associations_df)
## X2 X3
## 1 white 0.23
## 2 get 0.25
## 3 starbucks 0.25
## 4 always 0.26
## 5 asleep 0.26
## 6 wtf 0.29
# Plot the associations_df values (don't change this)
ggplot(associations_df, aes(y = associations_df[, 1])) +
geom_point(aes(x = associations_df[, 2]),
data = associations_df, size = 3) +
ggtitle("Word Associations to 'Venti'") +
theme_gdocs()
Here is a simple example showing how the bigram will have more terms than the unigram:
# Use only first 2 coffee tweets
coffee_tweets[1:2]
## [1] "@ayyytylerb that is so true drink lots of coffee"
## [2] "RT @bryzy_brib: Senior March tmw morning at 7:25 A.M. in the SENIOR lot. Get up early, make yo coffee/breakfast, cus this will only happen ?"
coffee_source_small <- VectorSource(coffee_tweets[1:2])
coffee_corpus_small <- VCorpus(coffee_source_small)
# Make a unigram DTM on first 2 coffee tweets
unigram_dtm <- DocumentTermMatrix(coffee_corpus_small)
unigram_dtm
## <<DocumentTermMatrix (documents: 2, terms: 24)>>
## Non-/sparse entries: 24/24
## Sparsity : 50%
## Maximal term length: 17
## Weighting : term frequency (tf)
# The RWeka package is already loaded
# Define bigram tokenizer
tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Make a bigram TDM
bigram_tdm <- TermDocumentMatrix(
clean_corpus(coffee_corpus_small),
control = list(tokenize = tokenizer)
)
bigram_tdm
## <<TermDocumentMatrix (terms: 18, documents: 2)>>
## Non-/sparse entries: 18/18
## Sparsity : 50%
## Maximal term length: 19
## Weighting : term frequency (tf)
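To inspect the tokens themselves, tm's Terms() accessor works on any TDM or DTM (a quick check; output omitted):
# List the first few bigram tokens produced by the tokenizer
head(Terms(bigram_tdm))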
Here is another example with the full chardonnay_clean_corp:
# Make tokenizer function
tokenizer <- function(x)
NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Create unigram_dtm
unigram_dtm <- DocumentTermMatrix(chardonnay_clean_corp)
# Create bigram_dtm
bigram_dtm <- DocumentTermMatrix(
chardonnay_clean_corp,
control = list(tokenize = tokenizer))
# Examine unigram_dtm
unigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 3063)>>
## Non-/sparse entries: 7142/3055858
## Sparsity : 100%
## Maximal term length: 266
## Weighting : term frequency (tf)
# Examine bigram_dtm
bigram_dtm
## <<DocumentTermMatrix (documents: 1000, terms: 4945)>>
## Non-/sparse entries: 6846/4938154
## Sparsity : 100%
## Maximal term length: 271
## Weighting : term frequency (tf)
Note that http would be the largest term in the word cloud. Whoops.
# Create bigram_dtm_m
bigram_dtm_m <- as.matrix(bigram_dtm)
# Create freq
freq <- colSums(bigram_dtm_m)
# Create bi_words
bi_words <- names(freq)
# Examine part of bi_words
bi_words[2577:2587]
## [1] "make bitches" "make different" "make just"
## [4] "make quick" "make scene" "make sound"
## [7] "make swing" "make think" "makes cocksucking"
## [10] "makes cry" "makes fat"
# Plot a wordcloud
wordcloud(bi_words, freq, max.words = 15)
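To keep URL fragments like http out of the cloud, one option (a sketch, not part of the original exercise) is stripping URLs with a custom content_transformer() before the other cleaning steps:
# Hypothetical cleaner: drop http/https links before punctuation removal mangles them
removeURL <- content_transformer(function(x) gsub("http\\S+", "", x))
chardonnay_no_url <- tm_map(chardonnay_corpus, removeURL)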
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
return(corpus)
}
coffee_clean_corpus <- clean_corpus(coffee_corpus)
# Create tdm and matrix with normal weighting
tf_tdm <- TermDocumentMatrix(coffee_clean_corpus)
tf_tdm_m <- as.matrix(tf_tdm)
tf_tdm_m[510:520, 5:10]
## Docs
## Terms 5 6 7 8 9 10
## cocktail 0 1 0 0 0 0
## cocoa 0 0 0 0 0 0
## cocobear20 0 0 0 0 0 0
## coconut 0 0 0 0 0 0
## codagogy 0 0 0 0 0 0
## codealan 0 0 0 0 0 0
## coffee 1 1 1 1 1 1
## coffeeaddict 0 0 0 0 0 0
## coffeeboy25 0 0 0 0 0 0
## coffeebreakfast 0 0 0 0 0 0
## coffeecrawl 0 0 0 0 0 0
# Create tdm and matrix with tf-idf weighting
tf_idf_tdm <- TermDocumentMatrix(
coffee_clean_corpus,
control = list(weighting = weightTfIdf)
)
tf_idf_tdm_m <- as.matrix(tf_idf_tdm)
tf_idf_tdm_m[510:520, 5:10]
## Docs
## Terms 5 6 7 8 9
## cocktail 0.000000000 1.66096405 0.000000000 0.00000000 0.00000000
## cocoa 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## cocobear20 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## coconut 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## codagogy 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## codealan 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## coffee 0.009237915 0.01385687 0.007558294 0.02771375 0.02078531
## coffeeaddict 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## coffeeboy25 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## coffeebreakfast 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## coffeecrawl 0.000000000 0.00000000 0.000000000 0.00000000 0.00000000
## Docs
## Terms 10
## cocktail 0.00000000
## cocoa 0.00000000
## cocobear20 0.00000000
## coconut 0.00000000
## codagogy 0.00000000
## codealan 0.00000000
## coffee 0.02078531
## coffeeaddict 0.00000000
## coffeeboy25 0.00000000
## coffeebreakfast 0.00000000
## coffeecrawl 0.00000000
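As a rough sanity check (a sketch using only the idf component of tm's documented weightTfIdf formula, log2(nDocs / docFreq)), you can see why coffee is down-weighted so heavily:
# Compute the inverse document frequency for every term
n_docs <- ncol(tf_tdm_m)
doc_freq <- rowSums(tf_tdm_m > 0)
idf <- log2(n_docs / doc_freq)
head(sort(idf))  # lowest idf = terms appearing in nearly every tweet, e.g. coffee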
# Create mapping to metadata
custom_reader <- readTabular(
mapping = list(
content = "text",
id = "num",
author = "screenName",
date = "created"))
# Create VCorpus including metadata
text_corpus <- VCorpus(
DataframeSource(tweets),
readerControl = list(reader = custom_reader)
)
text_corpus <- clean_corpus(text_corpus)
text_corpus[[1]][1]
## $content
## [1] "ayyytylerb true drink lots coffee"
text_corpus[[1]][2]
## $meta
## id : 1
## author : thejennagibson
## date : 8/9/2013 2:43
## language: en
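The same fields can also be pulled with tm's meta() accessor:
# Retrieve a single metadata tag from the first document
meta(text_corpus[[1]], tag = "author")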
amzn <- read.csv('https://assets.datacamp.com/production/course_935/datasets/500_amzn.csv')
goog <- read.csv('https://assets.datacamp.com/production/course_935/datasets/500_goog.csv')
# Print the structure of amzn
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 58 levels "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 44 44 44 44 44 44 44 44 44 44 ...
## $ pros : Factor w/ 496 levels "- Learn a lot, haven't been bored yet.",..: 492 56 152 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "*Depending on your manager, might work long hours",..: 156 276 246 89 288 187 374 212 112 160 ...
# Create amzn_pros
amzn_pros <- amzn$pros
# Create amzn_cons
amzn_cons <- amzn$cons
# Print the structure of goog
str(goog)
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "- Access to a vast wealth of technical resources and people",..: 21 27 488 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow "| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...
# Create goog_pros
goog_pros <- goog$pros
# Create goog_cons
goog_cons <- goog$cons
# Clean with qdap
qdap_clean <- function(x) {
x <- replace_abbreviation(x)
x <- replace_contraction(x)
x <- replace_number(x)
x <- replace_ordinal(x)
x <- replace_symbol(x)
x <- tolower(x)
return(x)
}
# Clean with tm
tm_clean <- function(corpus) {
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords,
c(stopwords("en"), "Google", "Amazon", "company"))
return(corpus)
}
# Alter amzn_pros
amzn_pros <- qdap_clean(amzn_pros)
# Alter amzn_cons
amzn_cons <- qdap_clean(amzn_cons)
# Remove NAs before creating the corpus so the RWeka tokenizer will work later.
# This could live in a cleaning function; for now the inline fix works.
amzn_pros[which(is.na(amzn_pros))] <- "NULL"
amzn_cons[which(is.na(amzn_cons))] <- "NULL"
# Create az_p_corp
az_p_corp <- VCorpus(VectorSource(amzn_pros))
# Create az_c_corp
az_c_corp <- VCorpus(VectorSource(amzn_cons))
# Create amzn_pros_corp
amzn_pros_corp <- tm_clean(az_p_corp)
# Create amzn_cons_corp
amzn_cons_corp <- tm_clean(az_c_corp)
# Apply qdap_clean to goog_pros
goog_pros <- qdap_clean(goog_pros)
# Apply qdap_clean to goog_cons
goog_cons <- qdap_clean(goog_cons)
# remove NAs
goog_pros[which(is.na(goog_pros))] <- "NULL"
goog_cons[which(is.na(goog_cons))] <- "NULL"
# Create goog_p_corp
goog_p_corp <- VCorpus(VectorSource(goog_pros))
# Create goog_c_corp
goog_c_corp <- VCorpus(VectorSource(goog_cons))
# Create goog_pros_corp
goog_pros_corp <- tm_clean(goog_p_corp)
# Create goog_cons_corp
goog_cons_corp <- tm_clean(goog_c_corp)
# Create a tokenizer
tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(
amzn_pros_corp,
control = list(tokenize = tokenizer)
)
# Create amzn_p_tdm_m
amzn_p_tdm_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_tdm_m)
# Plot a wordcloud using amzn_p_freq values
wordcloud(names(amzn_p_freq),
amzn_p_freq,
max.words = 25,
color = "blue")
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(
amzn_cons_corp,
control = list(tokenize = tokenizer))
# Create amzn_c_tdm_m
amzn_c_tdm_m <- as.matrix(amzn_c_tdm)
# Create amzn_c_freq
amzn_c_freq <- rowSums(amzn_c_tdm_m)
# Plot a wordcloud of negative Amazon bigrams
wordcloud(names(amzn_c_freq), amzn_c_freq,
max.words = 25, color = "red")
# Create amzn_c_tdm
amzn_c_tdm <- TermDocumentMatrix(
amzn_cons_corp,
control = list(tokenize = tokenizer))
# Print amzn_c_tdm to the console
amzn_c_tdm
## <<TermDocumentMatrix (terms: 4778, documents: 500)>>
## Non-/sparse entries: 5220/2383780
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
# Create amzn_c_tdm2 by removing sparse terms
amzn_c_tdm2 <- removeSparseTerms(amzn_c_tdm, sparse = .993)
# Create hc as a cluster of distance values
hc <- hclust(
d = dist(amzn_c_tdm2, method = "euclidean"),
method = "complete")
# Produce a plot of hc
plot(hc)
# Create amzn_p_tdm
amzn_p_tdm <- TermDocumentMatrix(
amzn_pros_corp,
control = list(tokenize = tokenizer)
)
# Create amzn_p_m
amzn_p_m <- as.matrix(amzn_p_tdm)
# Create amzn_p_freq
amzn_p_freq <- rowSums(amzn_p_m)
# Create term_frequency
term_frequency <- sort(amzn_p_freq, decreasing = T)
# Print the 5 most common terms
term_frequency[1:5]
## good pay great benefits smart people place work fast paced
## 25 24 20 17 16
# Find associations with fast paced
associations <- findAssocs(amzn_p_tdm, "fast paced", 0.2)
head(associations$`fast paced`, 20)
## paced environment environments ever learn fast
## 0.49 0.35 0.35
## paced friendly paced work able excel
## 0.35 0.35 0.25
## activity ample advance one also well
## 0.25 0.25 0.25
## amazon fast amazon noting amazon one
## 0.25 0.25 0.25
## amount time ample opportunity assistance ninety
## 0.25 0.25 0.25
## benefits including break computer call activity
## 0.25 0.25 0.25
## can choose catchy cheers
## 0.25 0.25
Now repeat the workflow with the goog dataset.
# Examine the structure of goog
str(goog)
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "- Access to a vast wealth of technical resources and people",..: 21 27 488 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow "| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...
all_goog_corpus <- VCorpus(VectorSource(goog[,3:4]))
str(all_goog_corpus)
## List of 2
## $ 1:List of 2
## ..$ content: chr [1:501] "* If you're a software engineer, you're among the kings of the hill at Google. It's an engineer-driven company "| __truncated__ "1) Food, food, food. 15+ cafes on main campus (MTV) alone. Mini-kitchens, snacks, drinks, free breakfast/lunch/"| __truncated__ "You can't find a more well-regarded company that actually deserves the hype it gets." "- You drive yourself here. If you want to grow, you have to seek out opportunities and prove that your worth. T"| __truncated__ ...
## ..$ meta :List of 7
## .. ..$ author : chr(0)
## .. ..$ datetimestamp: POSIXlt[1:1], format: "2017-10-09 21:20:04"
## .. ..$ description : chr(0)
## .. ..$ heading : chr(0)
## .. ..$ id : chr "1"
## .. ..$ language : chr "en"
## .. ..$ origin : chr(0)
## .. ..- attr(*, "class")= chr "TextDocumentMeta"
## ..- attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"
## $ 2:List of 2
## ..$ content: chr [1:501] "* It *is* becoming larger, and with it comes growing pains: bureaucracy, slow to respond to market threats, blo"| __truncated__ "1) Work/life balance. What balance? All those perks and benefits are an illusion. They keep you at work and the"| __truncated__ "I live in SF so the commute can take between 1.5 hours to 1.75 hours each way on the shuttle - sometimes 2 hour"| __truncated__ "- Google is a big company. So there are going to be winners and losers when it comes to career growth. Due to t"| __truncated__ ...
## ..$ meta :List of 7
## .. ..$ author : chr(0)
## .. ..$ datetimestamp: POSIXlt[1:1], format: "2017-10-09 21:20:04"
## .. ..$ description : chr(0)
## .. ..$ heading : chr(0)
## .. ..$ id : chr "2"
## .. ..$ language : chr "en"
## .. ..$ origin : chr(0)
## .. ..- attr(*, "class")= chr "TextDocumentMeta"
## ..- attr(*, "class")= chr [1:2] "PlainTextDocument" "TextDocument"
## - attr(*, "class")= chr [1:2] "VCorpus" "Corpus"
# Create all_goog_corp
all_goog_corp <- tm_clean(all_goog_corpus)
# Create all_tdm
all_tdm <- TermDocumentMatrix(all_goog_corp)
# Name the columns of all_tdm
colnames(all_tdm) <- c("Goog_Pros", "Goog_Cons")
# Create all_m
all_m <- as.matrix(all_tdm)
head(all_m)
## Docs
## Terms Goog_Pros Goog_Cons
## 100 1 0
## 1000 0 1
## 100k 0 1
## 106 1 0
## 175 0 1
## 200 0 1
# Build a comparison cloud
comparison.cloud(all_m,
colors = c("#F44336", "#2196f3"),
max.words = 100)
I need to create all_tdm_m before I can use the exercise code.
str(goog)
## 'data.frame': 501 obs. of 4 variables:
## $ pg_num: int 1 1 1 1 1 1 1 1 1 1 ...
## $ url : Factor w/ 50 levels "https://www.glassdoor.com/Reviews/Google-Reviews-E9079_P1.htm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ pros : Factor w/ 492 levels "- Access to a vast wealth of technical resources and people",..: 21 27 488 12 410 233 413 376 314 384 ...
## $ cons : Factor w/ 491 levels "- Bureaucracy, politics, legal issues, and privacy handling take up more and more time over the years and slow "| __truncated__,..: 18 26 176 6 296 62 453 447 186 113 ...
str(amzn)
## 'data.frame': 500 obs. of 4 variables:
## $ pg_num: int 50 50 50 50 50 50 50 50 50 50 ...
## $ url : Factor w/ 58 levels "https://www.glassdoor.com/Reviews/Amazon-com-Reviews-E6036_P10.htm",..: 44 44 44 44 44 44 44 44 44 44 ...
## $ pros : Factor w/ 496 levels "- Learn a lot, haven't been bored yet.",..: 492 56 152 349 359 367 183 417 210 352 ...
## $ cons : Factor w/ 495 levels "*Depending on your manager, might work long hours",..: 156 276 246 89 288 187 374 212 112 160 ...
# Create a data frame of just the positive reviews.
# goog has one more row than amzn, so append a 'NULL' entry to the Amazon reviews to equalize lengths.
pros <- data.frame(
amzn_pros = c(as.character(amzn$pros), 'NULL'),
goog_pros = as.character(goog$pros),
stringsAsFactors = F
)
pros[is.na(pros)] <- "NULL"
pros <- qdap_clean(pros)
# Create a corpus
all_pros_corp <- VCorpus(VectorSource(pros))
all_pros_corp <- tm_clean(all_pros_corp)
# Create a tdm with bi-grams
all_tdm <- TermDocumentMatrix(
all_pros_corp,
control = list(tokenize = tokenizer)
)
# Create all_tdm_m
all_tdm_m <- as.matrix(all_tdm)
head(all_tdm_m)
## Docs
## Terms 1 2
## ability customer 1 0
## ability iterate 1 0
## ability make 1 1
## ability see 1 0
## ability switch 0 1
## ability travel 0 1
# Create common_words
common_words <- subset(all_tdm_m,
all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Add difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[,3], decreasing = T), ]
# Create top15_df
top15_df <- data.frame(
x = common_words[1:15, 1],
y = common_words[1:15, 2],
labels = rownames(common_words[1:15, ])
)
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
labels = top15_df$labels,
gap = 12,
top.labels = c("Amzn", "Pro Words", "Google"),
main = "Words in Common", unit = NULL)
## [1] 5.1 4.1 4.1 2.1
# Create a data frame of just the negative reviews.
# goog has one more row than amzn, so append a 'NULL' entry to the Amazon reviews to equalize lengths.
cons <- data.frame(
amzn_cons = c(as.character(amzn$cons), 'NULL'),
goog_cons = as.character(goog$cons),
stringsAsFactors = F
)
cons[is.na(cons)] <- "NULL"
cons <- qdap_clean(cons)
# Create a corpus
all_cons_corp <- VCorpus(VectorSource(cons))
all_cons_corp <- tm_clean(all_cons_corp)
# Create a tdm with bi-grams
all_tdm <- TermDocumentMatrix(
all_cons_corp,
control = list(tokenize = tokenizer)
)
# Create all_tdm_m
all_tdm_m <- as.matrix(all_tdm)
# Create common_words
common_words <- subset(
all_tdm_m,
all_tdm_m[, 1] > 0 & all_tdm_m[, 2] > 0
)
# Create difference
difference <- abs(common_words[, 1] - common_words[, 2])
# Bind difference to common_words
common_words <- cbind(common_words, difference)
# Order the data frame from most differences to least
common_words <- common_words[order(common_words[, 3], decreasing = T), ]
# Create top15_df
top15_df <- data.frame(x = common_words[1:15, 1],
y = common_words[1:15, 2],
labels = rownames(common_words[1:15, ]))
# Create the pyramid plot
pyramid.plot(top15_df$x, top15_df$y,
labels = top15_df$labels,
main = "Words in Common",
top.labels = c("Amzn",
"Cons Words",
"Google"),
gap = 12,
unit = NULL
)
## [1] 5.1 4.1 4.1 2.1