Loading packages
## ── Attaching packages ─────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Importing review data
# Read the Roomba reviews; readr guesses the column types and reports them
review_data <- read_csv("Roomba Reviews.csv")
# Print the tibble so the preview shown below is actually produced
review_data
## Parsed with column specification:
## cols(
## date = col_character(),
## product = col_character(),
## stars = col_double(),
## title = col_character(),
## review = col_character()
## )
## # A tibble: 1,833 x 5
## date product stars title review
## <chr> <chr> <dbl> <chr> <chr>
## 1 2/28/15 iRobot Roomba… 5 Five Stars "You would not believe how w…
## 2 1/12/15 iRobot Roomba… 4 Four Stars "You just walk away and it d…
## 3 12/26/… iRobot Roomba… 5 Awesome love it. "You have to Roomba proof yo…
## 4 8/4/13 iRobot Roomba… 3 Love-hate this va… "Yes, it's a fascinating, al…
## 5 12/22/… iRobot Roomba… 5 This vacuum is fa… "Years ago I bought one of t…
## 6 12/27/… iRobot Roomba… 5 Wow! "Wow.Wow. I never knew my f…
## 7 8/17/15 iRobot Roomba… 1 Terrible Product … "Wow.. I don't know what to …
## 8 12/28/… iRobot Roomba… 5 Super-impressed b… "Wow, wow, WOW! I wanted to…
## 9 1/19/14 iRobot Roomba… 5 LOVE THIS "Wow, the Roomba is the best…
## 10 7/2/15 iRobot Roomba… 5 Stress is bad; Ro… "Wow, it changes your life. …
## # … with 1,823 more rows
Using filter() and summarize()
# Mean star rating, restricted to reviews of the Roomba 650
review_data %>%
  filter(product == "iRobot Roomba 650 for Pets") %>%
  summarize(stars_mean = mean(stars))
## # A tibble: 1 x 1
## stars_mean
## <dbl>
## 1 4.49
# Mean star rating computed separately for each product
review_data %>%
  group_by(product) %>%
  summarize(stars_mean = mean(stars))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## product stars_mean
## <chr> <dbl>
## 1 iRobot Roomba 650 for Pets 4.49
## 2 iRobot Roomba 880 for Pets and Allergies 4.42
# Deliberate counter-example: `review` is a character column, so mean()
# cannot aggregate it and yields NA (with a warning) for every group
review_data %>%
  group_by(product) %>%
  summarize(review_mean = mean(review))
## Warning in mean.default(review): argument is not numeric or logical: returning
## NA
## Warning in mean.default(review): argument is not numeric or logical: returning
## NA
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## product review_mean
## <chr> <dbl>
## 1 iRobot Roomba 650 for Pets NA
## 2 iRobot Roomba 880 for Pets and Allergies NA
# Load the tidyverse packages
library(tidyverse)
# Print twitter_data
# NOTE(review): twitter_data is not created anywhere in the visible code;
# it must already be loaded in the session -- confirm its source.
twitter_data
## # A tibble: 7,044 x 6
## tweet_id date complaint_label tweet_text usr_followers_c…
## <dbl> <dttm> <chr> <chr> <dbl>
## 1 4.77e17 2014-06-12 00:07:25 Non-Complaint 1. Haneda… 152
## 2 4.77e17 2014-06-12 00:12:30 Non-Complaint My plane … 184
## 3 4.77e17 2014-06-12 00:13:56 Complaint So appare… 136
## 4 4.77e17 2014-06-12 00:16:09 Non-Complaint Je suppor… 1
## 5 4.77e17 2014-06-12 00:17:37 Non-Complaint Dear @Ceb… 67
## 6 4.77e17 2014-06-12 00:18:49 Complaint Boo @Delt… 138
## 7 4.77e17 2014-06-12 00:26:42 Non-Complaint #PALFlies… 21
## 8 4.77e17 2014-06-12 00:31:08 Complaint @JetBlue … 133
## 9 4.77e17 2014-06-12 00:35:27 Non-Complaint Celebrati… 607
## 10 4.77e17 2014-06-12 00:46:47 Non-Complaint Don't do … 165
## # … with 7,034 more rows, and 1 more variable: usr_verified <lgl>
# Keep only the rows of twitter_data labelled as complaints and print them
twitter_data %>%
  filter(complaint_label == "Complaint")
## # A tibble: 1,676 x 6
## tweet_id date complaint_label tweet_text usr_followers_c…
## <dbl> <dttm> <chr> <chr> <dbl>
## 1 4.77e17 2014-06-12 00:13:56 Complaint So appare… 136
## 2 4.77e17 2014-06-12 00:18:49 Complaint Boo @Delt… 138
## 3 4.77e17 2014-06-12 00:31:08 Complaint @JetBlue … 133
## 4 4.77e17 2014-06-12 00:49:18 Complaint @TheRealK… 221
## 5 4.77e17 2014-06-12 00:54:32 Complaint @American… 10
## 6 4.77e17 2014-06-12 00:58:36 Complaint I strongl… 158
## 7 4.77e17 2014-06-12 01:08:40 Complaint @doncliff… 55
## 8 4.77e17 2014-06-12 01:27:36 Complaint @USAirway… 995
## 9 4.77e17 2014-06-12 02:17:21 Complaint Just aske… 7005
## 10 4.77e17 2014-06-12 02:18:16 Complaint @migs647 … 919
## # … with 1,666 more rows, and 1 more variable: usr_verified <lgl>
# Start with the data frame
twitter_data %>%
  # Group by whether or not the tweet is a complaint
  group_by(complaint_label) %>%
  # Compute the mean, min, and max follower counts
  summarize(
    avg_followers = mean(usr_followers_count),
    min_followers = min(usr_followers_count),
    max_followers = max(usr_followers_count)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## complaint_label avg_followers min_followers max_followers
## <chr> <dbl> <dbl> <dbl>
## 1 Complaint 3234. 0 1259803
## 2 Non-Complaint 4487. 0 2200851
Column types
## # A tibble: 1,833 x 5
## date product stars title review
## <chr> <chr> <dbl> <chr> <chr>
## 1 2/28/15 iRobot Roomba… 5 Five Stars "You would not believe how w…
## 2 1/12/15 iRobot Roomba… 4 Four Stars "You just walk away and it d…
## 3 12/26/… iRobot Roomba… 5 Awesome love it. "You have to Roomba proof yo…
## 4 8/4/13 iRobot Roomba… 3 Love-hate this va… "Yes, it's a fascinating, al…
## 5 12/22/… iRobot Roomba… 5 This vacuum is fa… "Years ago I bought one of t…
## 6 12/27/… iRobot Roomba… 5 Wow! "Wow.Wow. I never knew my f…
## 7 8/17/15 iRobot Roomba… 1 Terrible Product … "Wow.. I don't know what to …
## 8 12/28/… iRobot Roomba… 5 Super-impressed b… "Wow, wow, WOW! I wanted to…
## 9 1/19/14 iRobot Roomba… 5 LOVE THIS "Wow, the Roomba is the best…
## 10 7/2/15 iRobot Roomba… 5 Stress is bad; Ro… "Wow, it changes your life. …
## # … with 1,823 more rows
Summarize with n()
# n() counts the rows of the (ungrouped) data frame
review_data %>%
  summarize(number_rows = n())
## # A tibble: 1 x 1
## number_rows
## <int>
## 1 1833
# With a grouping in place, n() counts the rows per product
review_data %>%
  group_by(product) %>%
  summarize(number_rows = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## product number_rows
## <chr> <int>
## 1 iRobot Roomba 650 for Pets 633
## 2 iRobot Roomba 880 for Pets and Allergies 1200
Summarize with count()
# count() is shorthand for group_by() + summarize(n = n())
review_data %>%
  count(product)
## # A tibble: 2 x 2
## product n
## <chr> <int>
## 1 iRobot Roomba 650 for Pets 633
## 2 iRobot Roomba 880 for Pets and Allergies 1200
# Count reviews per product and list the most-reviewed product first
review_data %>%
  count(product) %>%
  arrange(desc(n))
## # A tibble: 2 x 2
## product n
## <chr> <int>
## 1 iRobot Roomba 880 for Pets and Allergies 1200
## 2 iRobot Roomba 650 for Pets 633
# Load the tidyverse package
library(tidyverse)
twitter_data %>%
  # Filter for just the complaints
  filter(complaint_label == "Complaint") %>%
  # Count the number of verified and non-verified users
  count(usr_verified)
## # A tibble: 2 x 2
## usr_verified n
## <lgl> <int>
## 1 FALSE 1650
## 2 TRUE 26
twitter_data %>%
  # Group by whether or not a user is verified
  group_by(usr_verified) %>%
  summarize(
    # Compute the average number of followers
    avg_followers = mean(usr_followers_count),
    # Count the number of users in each category
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## usr_verified avg_followers n
## <lgl> <dbl> <int>
## 1 FALSE 1999. 6927
## 2 TRUE 133849. 117
Tokenizing Text
Using unnest_tokens()
# unnest_tokens() comes from tidytext, which is not loaded earlier in this view
library(tidytext)
# Split each review into one lowercase word per row (column `word`)
tidy_review <- review_data %>%
  unnest_tokens(word, review)
# Print the tokenized tibble shown below
tidy_review
## # A tibble: 229,481 x 5
## date product stars title word
## <chr> <chr> <dbl> <chr> <chr>
## 1 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars you
## 2 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars would
## 3 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars not
## 4 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars believe
## 5 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars how
## 6 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars well
## 7 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars this
## 8 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars works
## 9 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars you
## 10 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars just
## # … with 229,471 more rows
Counting words
# Count each word and list the most frequent words first
tidy_review %>%
  count(word) %>%
  arrange(desc(n))
## # A tibble: 10,310 x 2
## word n
## <chr> <int>
## 1 the 11785
## 2 it 7905
## 3 and 6794
## 4 to 6440
## 5 i 6034
## 6 a 5884
## 7 is 3347
## 8 of 3229
## 9 have 2470
## 10 that 2410
## # … with 10,300 more rows
Using anti_join()
# Tokenize the reviews, then drop every word found in tidytext's stop_words
tidy_review2 <- review_data %>%
  unnest_tokens(word, review) %>%
  anti_join(stop_words)
# Print the filtered tibble shown below
tidy_review2
## Joining, by = "word"
## # A tibble: 78,868 x 5
## date product stars title word
## <chr> <chr> <dbl> <chr> <chr>
## 1 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars walk
## 2 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars rest
## 3 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. roomba
## 4 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. proof
## 5 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. house
## 6 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. awesome
## 7 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. pet
## 8 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. cleans
## 9 8/4/13 iRobot Roomba 650 for Pets 3 Love-hate this vaccuum fascinating
## 10 8/4/13 iRobot Roomba 650 for Pets 3 Love-hate this vaccuum albeit
## # … with 78,858 more rows
Counting words again
# Word counts after stop-word removal, most frequent first
tidy_review2 %>%
  count(word) %>%
  arrange(desc(n))
## # A tibble: 9,672 x 2
## word n
## <chr> <int>
## 1 roomba 2286
## 2 clean 1204
## 3 vacuum 989
## 4 hair 900
## 5 cleaning 809
## 6 time 795
## 7 house 745
## 8 floors 657
## 9 day 578
## 10 floor 561
## # … with 9,662 more rows
# Load the tidyverse and tidytext packages
library(tidyverse)
library(tidytext)
tidy_twitter <- twitter_data %>%
  # Tokenize the twitter data
  unnest_tokens(word, tweet_text)
tidy_twitter %>%
  # Compute word counts
  count(word) %>%
  # Arrange the counts in descending order
  arrange(desc(n))
## # A tibble: 18,600 x 2
## word n
## <chr> <int>
## 1 to 2834
## 2 the 2212
## 3 a 1989
## 4 i 1752
## 5 t.co 1405
## 6 http 1361
## 7 for 1356
## 8 you 1345
## 9 on 1289
## 10 and 1153
## # … with 18,590 more rows
tidy_twitter <- twitter_data %>%
  # Tokenize the twitter data
  unnest_tokens(word, tweet_text) %>%
  # Remove stop words
  anti_join(stop_words)
## Joining, by = "word"
tidy_twitter %>%
  # Filter to keep complaints only
  filter(complaint_label == "Complaint") %>%
  # Compute word counts and arrange in descending order
  count(word) %>%
  arrange(desc(n))
## # A tibble: 3,863 x 2
## word n
## <chr> <int>
## 1 flight 459
## 2 united 362
## 3 americanair 294
## 4 usairways 207
## 5 time 167
## 6 delta 141
## 7 service 137
## 8 2 129
## 9 delayed 123
## 10 british_airways 121
## # … with 3,853 more rows
Starting with tidy text
# Tag each review with its row number before tokenizing, so every word can
# be traced back to its review, then remove the stop words
tidy_review <- review_data %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, review) %>%
  anti_join(stop_words)
# Print the tibble shown below
tidy_review
## Joining, by = "word"
## # A tibble: 78,868 x 6
## date product stars title id word
## <chr> <chr> <dbl> <chr> <int> <chr>
## 1 1/12/15 iRobot Roomba 650 for P… 4 Four Stars 2 walk
## 2 1/12/15 iRobot Roomba 650 for P… 4 Four Stars 2 rest
## 3 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 roomba
## 4 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 proof
## 5 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 house
## 6 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 awesome
## 7 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 pet
## 8 12/26/13 iRobot Roomba 650 for P… 5 Awesome love it. 3 cleans
## 9 8/4/13 iRobot Roomba 650 for P… 3 Love-hate this vacc… 4 fascinati…
## 10 8/4/13 iRobot Roomba 650 for P… 3 Love-hate this vacc… 4 albeit
## # … with 78,858 more rows
Visualizing count with geom_col()
# Count words, most frequent first
word_counts <- tidy_review %>%
  count(word) %>%
  arrange(desc(n))
# One bar per word; with every word included the axis is unreadable,
# which motivates the filter() in the next section
ggplot(
  word_counts, aes(x = word, y = n)
) +
  geom_col()
filter() before visualizing
# Keep only words that occur more than 300 times, most frequent first
word_counts2 <- tidy_review %>%
  count(word) %>%
  filter(n > 300) %>%
  arrange(desc(n))
# Print the tibble shown below
word_counts2
## # A tibble: 25 x 2
## word n
## <chr> <int>
## 1 roomba 2286
## 2 clean 1204
## 3 vacuum 989
## 4 hair 900
## 5 cleaning 809
## 6 time 795
## 7 house 745
## 8 floors 657
## 9 day 578
## 10 floor 561
## # … with 15 more rows
And flip coordinates
# Horizontal bar chart of the frequent review words
ggplot(word_counts2, aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Review Word Counts")
word_counts <- tidy_twitter %>%
  filter(complaint_label == "Complaint") %>%
  count(word) %>%
  # Keep words with count greater than 100
  filter(n > 100)
# Create a bar plot using word_counts with x = word
ggplot(word_counts, aes(x = word, y = n)) +
  geom_col() +
  # Flip the plot coordinates
  coord_flip()
# Word counts for the non-complaint tweets only
word_counts <- tidy_twitter %>%
  filter(complaint_label == "Non-Complaint") %>%
  count(word) %>%
  # Retain words seen more than 150 times
  filter(n > 150)
# Horizontal bar chart of the retained words
ggplot(word_counts, aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Non-Complaint Word Counts")
Custom stop words
Removing stop words again
# Rebuild the tidy reviews, this time removing the extended stop-word list.
# NOTE(review): stop_words2 must already exist and include "roomba" (the
# check below returns zero rows); its definition is not visible here.
tidy_review <- review_data %>%
  mutate(id = row_number()) %>%
  select(id, date, product, stars, review) %>%
  unnest_tokens(word, review) %>%
  anti_join(stop_words2)
## Joining, by = "word"
# Sanity check: no "roomba" tokens should remain after stop-word removal
tidy_review %>%
  filter(word == "roomba")
## # A tibble: 0 x 5
## # … with 5 variables: id <int>, date <chr>, product <chr>, stars <dbl>,
## # word <chr>
Using fct_reorder()
# word2 is a factor whose levels are ordered by count, so plots sort by n
word_counts <- tidy_review %>%
  count(word) %>%
  filter(n > 300) %>%
  mutate(word2 = fct_reorder(word, n))
# Print the tibble shown below
word_counts
## # A tibble: 23 x 3
## word n word2
## <chr> <int> <fct>
## 1 880 525 880
## 2 bin 428 bin
## 3 carpet 368 carpet
## 4 clean 1204 clean
## 5 cleaning 809 cleaning
## 6 day 578 day
## 7 dirt 384 dirt
## 8 dog 407 dog
## 9 dust 543 dust
## 10 floor 561 floor
## # … with 13 more rows
Arranging the bar plot
# Plot the count-ordered factor so the bars appear sorted by frequency
ggplot(word_counts, aes(x = word2, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Review Word Counts")
custom_stop_words <- tribble(
  # Column names should match stop_words
  ~word, ~lexicon,
  # Add http, win, and t.co as custom stop words
  "http", "CUSTOM",
  "win", "CUSTOM",
  "t.co", "CUSTOM"
)
# Bind the custom stop words to stop_words
stop_words2 <- stop_words %>%
  bind_rows(custom_stop_words)
# Non-complaint word counts with a count-ordered factor for plotting
word_counts <- tidy_twitter %>%
  filter(complaint_label == "Non-Complaint") %>%
  count(word) %>%
  # Retain terms seen more than 100 times
  filter(n > 100) %>%
  # word2: `word` as a factor whose levels follow the counts
  mutate(word2 = fct_reorder(word, n))
# Bars are drawn in factor-level (count) order
ggplot(word_counts, aes(x = word2, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Non-Complaint Word Counts")
Counting by product
# Count each (word, product) pair, largest counts first
tidy_review %>%
  count(word, product) %>%
  arrange(desc(n))
## # A tibble: 12,719 x 3
## word product n
## <chr> <chr> <int>
## 1 clean iRobot Roomba 880 for Pets and Allergies 815
## 2 vacuum iRobot Roomba 880 for Pets and Allergies 678
## 3 hair iRobot Roomba 880 for Pets and Allergies 595
## 4 cleaning iRobot Roomba 880 for Pets and Allergies 560
## 5 880 iRobot Roomba 880 for Pets and Allergies 518
## 6 house iRobot Roomba 880 for Pets and Allergies 494
## 7 time iRobot Roomba 880 for Pets and Allergies 494
## 8 floors iRobot Roomba 880 for Pets and Allergies 405
## 9 love iRobot Roomba 880 for Pets and Allergies 403
## 10 dust iRobot Roomba 880 for Pets and Allergies 399
## # … with 12,709 more rows
Using top_n()
# Ten most frequent words per product; the result is still grouped
tidy_review %>%
  count(word, product) %>%
  group_by(product) %>%
  top_n(10, n)
## # A tibble: 20 x 3
## # Groups: product [2]
## word product n
## <chr> <chr> <int>
## 1 880 iRobot Roomba 880 for Pets and Allergies 518
## 2 clean iRobot Roomba 650 for Pets 389
## 3 clean iRobot Roomba 880 for Pets and Allergies 815
## 4 cleaning iRobot Roomba 650 for Pets 249
## 5 cleaning iRobot Roomba 880 for Pets and Allergies 560
## 6 day iRobot Roomba 650 for Pets 209
## 7 dust iRobot Roomba 880 for Pets and Allergies 399
## 8 floor iRobot Roomba 650 for Pets 207
## 9 floors iRobot Roomba 650 for Pets 252
## 10 floors iRobot Roomba 880 for Pets and Allergies 405
## 11 hair iRobot Roomba 650 for Pets 305
## 12 hair iRobot Roomba 880 for Pets and Allergies 595
## 13 house iRobot Roomba 650 for Pets 251
## 14 house iRobot Roomba 880 for Pets and Allergies 494
## 15 love iRobot Roomba 880 for Pets and Allergies 403
## 16 run iRobot Roomba 650 for Pets 180
## 17 time iRobot Roomba 650 for Pets 301
## 18 time iRobot Roomba 880 for Pets and Allergies 494
## 19 vacuum iRobot Roomba 650 for Pets 311
## 20 vacuum iRobot Roomba 880 for Pets and Allergies 678
Using ungroup()
# Same top-10 selection, then drop the grouping so later verbs
# operate on the whole table
tidy_review %>%
  count(word, product) %>%
  group_by(product) %>%
  top_n(10, n) %>%
  ungroup()
## # A tibble: 20 x 3
## word product n
## <chr> <chr> <int>
## 1 880 iRobot Roomba 880 for Pets and Allergies 518
## 2 clean iRobot Roomba 650 for Pets 389
## 3 clean iRobot Roomba 880 for Pets and Allergies 815
## 4 cleaning iRobot Roomba 650 for Pets 249
## 5 cleaning iRobot Roomba 880 for Pets and Allergies 560
## 6 day iRobot Roomba 650 for Pets 209
## 7 dust iRobot Roomba 880 for Pets and Allergies 399
## 8 floor iRobot Roomba 650 for Pets 207
## 9 floors iRobot Roomba 650 for Pets 252
## 10 floors iRobot Roomba 880 for Pets and Allergies 405
## 11 hair iRobot Roomba 650 for Pets 305
## 12 hair iRobot Roomba 880 for Pets and Allergies 595
## 13 house iRobot Roomba 650 for Pets 251
## 14 house iRobot Roomba 880 for Pets and Allergies 494
## 15 love iRobot Roomba 880 for Pets and Allergies 403
## 16 run iRobot Roomba 650 for Pets 180
## 17 time iRobot Roomba 650 for Pets 301
## 18 time iRobot Roomba 880 for Pets and Allergies 494
## 19 vacuum iRobot Roomba 650 for Pets 311
## 20 vacuum iRobot Roomba 880 for Pets and Allergies 678
Using fct_reorder()
# Top 10 words per product, ungrouped, plus a count-ordered factor
tidy_review %>%
  count(word, product) %>%
  group_by(product) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word2 = fct_reorder(word, n))
## # A tibble: 20 x 4
## word product n word2
## <chr> <chr> <int> <fct>
## 1 880 iRobot Roomba 880 for Pets and Allergies 518 880
## 2 clean iRobot Roomba 650 for Pets 389 clean
## 3 clean iRobot Roomba 880 for Pets and Allergies 815 clean
## 4 cleaning iRobot Roomba 650 for Pets 249 cleaning
## 5 cleaning iRobot Roomba 880 for Pets and Allergies 560 cleaning
## 6 day iRobot Roomba 650 for Pets 209 day
## 7 dust iRobot Roomba 880 for Pets and Allergies 399 dust
## 8 floor iRobot Roomba 650 for Pets 207 floor
## 9 floors iRobot Roomba 650 for Pets 252 floors
## 10 floors iRobot Roomba 880 for Pets and Allergies 405 floors
## 11 hair iRobot Roomba 650 for Pets 305 hair
## 12 hair iRobot Roomba 880 for Pets and Allergies 595 hair
## 13 house iRobot Roomba 650 for Pets 251 house
## 14 house iRobot Roomba 880 for Pets and Allergies 494 house
## 15 love iRobot Roomba 880 for Pets and Allergies 403 love
## 16 run iRobot Roomba 650 for Pets 180 run
## 17 time iRobot Roomba 650 for Pets 301 time
## 18 time iRobot Roomba 880 for Pets and Allergies 494 time
## 19 vacuum iRobot Roomba 650 for Pets 311 vacuum
## 20 vacuum iRobot Roomba 880 for Pets and Allergies 678 vacuum
Using facet_wrap()
# ggplot(word_counts, aes(x = word2, y = n, fill = product)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~ product, scales = "free_y") +
# coord_flip() +
# ggtitle("Review Word Counts")
word_counts <- tidy_twitter %>%
  # Count words by whether or not it's a complaint
  count(word, complaint_label) %>%
  # Group by whether or not it's a complaint
  group_by(complaint_label) %>%
  # Keep the top 20 words
  top_n(20, n) %>%
  # Ungroup before reordering word as a factor by the count
  ungroup() %>%
  mutate(word2 = fct_reorder(word, n))
# Include a fill aesthetic tied to whether or not it's a complaint
ggplot(word_counts, aes(x = word2, y = n, fill = complaint_label)) +
  # Don't include the legend for the column plot
  geom_col(show.legend = FALSE) +
  # Facet by whether or not it's a complaint and make the y-axis free
  facet_wrap(~ complaint_label, scales = "free_y") +
  # Flip the coordinates and add a title: "Twitter Word Counts"
  coord_flip() +
  ggtitle("Twitter Word Counts")
Using wordcloud()
## Loading required package: RColorBrewer
# wordcloud() lives in the wordcloud package
library(wordcloud)
# One row per word with its frequency in column n
word_counts <- tidy_review %>%
  count(word)
# Draw at most the 30 most frequent words
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30
)
Fixed size and random start points
# Same call again: identical arguments, drawn a second time
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30
)
Number of words in cloud
# Raise the cap to the 70 most frequent words
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 70
)
Using colors
# Draw every word in a single color
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30,
  colors = "blue"
)
# Load the wordcloud package
library(wordcloud)
# Compute word counts and assign to word_counts
word_counts <- tidy_twitter %>%
  count(word)
wordcloud(
  # Assign the word column to words
  words = word_counts$word,
  # Assign the count column to freq
  freq = word_counts$n,
  max.words = 30
)
# Compute complaint word counts and assign to word_counts
word_counts <- tidy_twitter %>%
  filter(complaint_label == "Complaint") %>%
  count(word)
# Create a complaint word cloud of the top 50 terms, colored red
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 50,
  colors = "red"
)
Bing dictionary
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
# How many words the bing lexicon assigns to each sentiment
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
Afinn dictionary
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
# Range of the numeric scores in the afinn lexicon
get_sentiments("afinn") %>%
  summarize(
    min = min(value),
    max = max(value)
  )
## # A tibble: 1 x 2
## min max
## <dbl> <dbl>
## 1 -5 5
Loughran dictionary
# Count the words per sentiment in the loughran lexicon and order the
# sentiment factor by that count
sentiment_counts <- get_sentiments("loughran") %>%
  count(sentiment) %>%
  mutate(sentiment2 = fct_reorder(sentiment, n))
ggplot(sentiment_counts, aes(x = sentiment2, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: x is the sentiment
  # factor and y is the count, regardless of coord_flip()
  labs(
    title = "Sentiment Counts in Loughran",
    x = "Sentiment",
    y = "Counts"
  )
# Load the tidyverse and tidytext packages
library(tidyverse)
library(tidytext)
# Count the number of words associated with each sentiment in nrc
get_sentiments("nrc") %>%
  count(sentiment) %>%
  # Arrange the counts in descending order
  arrange(desc(n))
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
## 3 fear 1476
## 4 anger 1247
## 5 trust 1231
## 6 sadness 1191
## 7 disgust 1058
## 8 anticipation 839
## 9 joy 689
## 10 surprise 534
# Pull in the nrc dictionary, count the sentiments and reorder them by count
sentiment_counts <- get_sentiments("nrc") %>%
  count(sentiment) %>%
  mutate(sentiment2 = fct_reorder(sentiment, n))
# Visualize sentiment_counts using the new sentiment factor column
ggplot(sentiment_counts, aes(x = sentiment2, y = n)) +
  geom_col() +
  coord_flip() +
  # Change the title to "Sentiment Counts in NRC", x-axis to "Sentiment", and y-axis to "Counts"
  labs(
    title = "Sentiment Counts in NRC",
    x = "Sentiment",
    y = "Counts"
  )
Using inner_join()
# Keep only review words that appear in the loughran lexicon, attaching
# their sentiment (joined on the shared `word` column)
tidy_review %>%
  inner_join(get_sentiments("loughran"))
## Joining, by = "word"
## # A tibble: 3,960 x 6
## id date product stars word sentiment
## <int> <chr> <chr> <dbl> <chr> <chr>
## 1 5 12/22/15 iRobot Roomba 650 for Pets 5 slow negative
## 2 5 12/22/15 iRobot Roomba 650 for Pets 5 easily positive
## 3 5 12/22/15 iRobot Roomba 650 for Pets 5 random uncertainty
## 4 5 12/22/15 iRobot Roomba 650 for Pets 5 easy positive
## 5 5 12/22/15 iRobot Roomba 650 for Pets 5 easy positive
## 6 5 12/22/15 iRobot Roomba 650 for Pets 5 easy positive
## 7 6 12/27/15 iRobot Roomba 650 for Pets 5 invention positive
## 8 7 8/17/15 iRobot Roomba 650 for Pets 1 damage negative
## 9 7 8/17/15 iRobot Roomba 650 for Pets 1 damage negative
## 10 7 8/17/15 iRobot Roomba 650 for Pets 1 justice litigious
## # … with 3,950 more rows
Counting sentiment
# Store the sentiment-tagged review words for the summaries that follow
sentiment_review <- tidy_review %>%
  inner_join(get_sentiments("loughran"))
## Joining, by = "word"
# Number of tagged words per sentiment
sentiment_review %>%
  count(sentiment)
## # A tibble: 6 x 2
## sentiment n
## <chr> <int>
## 1 constraining 170
## 2 litigious 53
## 3 negative 1795
## 4 positive 1568
## 5 superfluous 1
## 6 uncertainty 373
# Most frequent (word, sentiment) pairs first
sentiment_review %>%
  count(word, sentiment) %>%
  arrange(desc(n))
## # A tibble: 598 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 easy positive 297
## 2 happy positive 107
## 3 easier positive 97
## 4 easily positive 92
## 5 perfect positive 87
## 6 random uncertainty 81
## 7 impressed positive 77
## 8 excellent positive 58
## 9 trouble negative 58
## 10 fantastic positive 56
## # … with 588 more rows
Visualizing sentiment
# Restrict to the two polar sentiments
sentiment_review2 <- sentiment_review %>%
  filter(sentiment %in% c("positive", "negative"))
# Top 10 words per sentiment, with a count-ordered factor for plotting
word_counts <- sentiment_review2 %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  mutate(
    word2 = fct_reorder(word, n)
  )
# One facet per sentiment, each with its own axes
ggplot(word_counts, aes(x = word2, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  labs(
    title = "Sentiment Word Counts",
    x = "Words"
  )
# Join tidy_twitter and the NRC sentiment dictionary
sentiment_twitter <- tidy_twitter %>%
  inner_join(get_sentiments("nrc"))
## Joining, by = "word"
# Count the sentiments in tidy_twitter
sentiment_twitter %>%
  count(sentiment) %>%
  # Arrange the sentiment counts in descending order
  arrange(desc(n))
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 4415
## 2 trust 2873
## 3 negative 2747
## 4 anticipation 2124
## 5 joy 1480
## 6 sadness 1426
## 7 fear 1357
## 8 anger 1156
## 9 surprise 992
## 10 disgust 880
word_counts <- tidy_twitter %>%
  # Append the NRC dictionary and filter for positive, fear, and trust
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "fear", "trust")) %>%
  # Count by word and sentiment and keep the top 10 of each
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  top_n(10, n) %>%
  ungroup() %>%
  # Create a factor called word2 that has each word ordered by the count
  mutate(
    word2 = fct_reorder(word, n)
  )
## Joining, by = "word"
# Create a bar plot out of the word counts colored by sentiment
ggplot(word_counts, aes(x = word2, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  # Create a separate facet for each sentiment with free axes
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  # Title the plot "Sentiment Word Counts" with "Words" for the x-axis
  labs(
    title = "Sentiment Word Counts",
    x = "Words"
  )
Count sentiment by rating
# Tag review words with bing sentiment and tally them per star rating
tidy_review %>%
  inner_join(get_sentiments("bing")) %>%
  count(stars, sentiment)
## Joining, by = "word"
## # A tibble: 10 x 3
## stars sentiment n
## <dbl> <chr> <int>
## 1 1 negative 381
## 2 1 positive 241
## 3 2 negative 384
## 4 2 positive 247
## 5 3 negative 485
## 6 3 positive 432
## 7 4 negative 984
## 8 4 positive 973
## 9 5 negative 3705
## 10 5 positive 5083
Using spread()
# Same tally, reshaped so each sentiment becomes its own column
tidy_review %>%
  inner_join(get_sentiments("bing")) %>%
  count(stars, sentiment) %>%
  spread(sentiment, n)
## Joining, by = "word"
## # A tibble: 5 x 3
## stars negative positive
## <dbl> <int> <int>
## 1 1 381 241
## 2 2 384 247
## 3 3 485 432
## 4 4 984 973
## 5 5 3705 5083
Computing overall sentiment
# Net sentiment per star rating: positive word count minus negative
tidy_review %>%
  inner_join(get_sentiments("bing")) %>%
  count(stars, sentiment) %>%
  spread(sentiment, n) %>%
  mutate(overall_sentiment = positive - negative)
## Joining, by = "word"
## # A tibble: 5 x 4
## stars negative positive overall_sentiment
## <dbl> <int> <int> <int>
## 1 1 381 241 -140
## 2 2 384 247 -137
## 3 3 485 432 -53
## 4 4 984 973 -11
## 5 5 3705 5083 1378
Visualize sentiment by rating
# Net sentiment per star rating, with `stars` turned into a factor whose
# levels are ordered by that net sentiment
sentiment_stars <- tidy_review %>%
  inner_join(get_sentiments("bing")) %>%
  count(stars, sentiment) %>%
  spread(sentiment, n) %>%
  mutate(
    overall_sentiment = positive - negative,
    stars = fct_reorder(as.factor(stars), overall_sentiment)
  )
## Joining, by = "word"
# One bar per star rating, filled by rating, legend suppressed
ggplot(sentiment_stars, aes(x = stars, y = overall_sentiment, fill = as.factor(stars))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Overall Sentiment by Stars",
    subtitle = "Reviews for Robotic Vacuums",
    x = "Stars",
    y = "Overall Sentiment"
  )
# NRC sentiment tallies per complaint label, one column per sentiment
tidy_twitter %>%
  # Attach the NRC sentiment dictionary
  inner_join(get_sentiments("nrc")) %>%
  # Tally by complaint label and sentiment
  count(complaint_label, sentiment) %>%
  # Turn each sentiment into its own count column
  spread(sentiment, n)
## Joining, by = "word"
## # A tibble: 2 x 11
## complaint_label anger anticipation disgust fear joy negative positive
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Complaint 559 730 439 552 372 1272 1392
## 2 Non-Complaint 597 1394 441 805 1108 1475 3023
## # … with 3 more variables: sadness <int>, surprise <int>, trust <int>
# Summed afinn scores per complaint label and verification status
tidy_twitter %>%
  # Attach the afinn sentiment dictionary
  inner_join(get_sentiments("afinn")) %>%
  # Group by complaint label and by whether the user is verified
  group_by(complaint_label, usr_verified) %>%
  # aggregate_value is the total afinn score within each group
  summarize(aggregate_value = sum(value)) %>%
  # Turn each complaint label into its own column of totals
  spread(complaint_label, aggregate_value) %>%
  # Overall score is the sum of the two label columns
  mutate(overall_sentiment = Complaint + `Non-Complaint`)
## Joining, by = "word"
## `summarise()` regrouping output by 'complaint_label' (override with `.groups` argument)
## # A tibble: 2 x 4
## usr_verified Complaint `Non-Complaint` overall_sentiment
## <lgl> <dbl> <dbl> <dbl>
## 1 FALSE -1556 2348 792
## 2 TRUE -12 63 51
# Net bing sentiment (positive minus negative word counts) per complaint label
sentiment_twitter <- tidy_twitter %>%
  # Attach the bing sentiment dictionary
  inner_join(get_sentiments("bing")) %>%
  # Tally by complaint label and sentiment
  count(complaint_label, sentiment) %>%
  # Turn each sentiment into its own count column
  spread(sentiment, n) %>%
  # overall_sentiment = positive - negative
  mutate(overall_sentiment = positive - negative)
## Joining, by = "word"
# Create a bar plot out of overall sentiment by complaint level, colored by a complaint label factor
ggplot(
  sentiment_twitter,
  aes(x = complaint_label, y = overall_sentiment, fill = as.factor(complaint_label))
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  # Title the plot "Overall Sentiment by Complaint Type," with an "Airline Twitter Data" subtitle
  labs(
    title = "Overall Sentiment by Complaint Type",
    subtitle = "Airline Twitter Data"
  )
# Start with the topics output from the LDA run
## topic term beta
## 1 1 _adowaa_ 0.000003570
## 2 2 _adowaa_ 0.000040200
## 3 1 _arzar 0.000003570
## 4 2 _arzar 0.000040200
## 5 1 _austrian 0.000003570
## 6 2 _austrian 0.000405874
## 7 1 _bbbb_ 0.000003570
## 8 2 _bbbb_ 0.000040200
## 9 1 _cierratindall 0.000039300
## 10 2 _cierratindall 0.000003660
## 11 1 _confucksia 0.000003570
## 12 2 _confucksia 0.000040200
## 13 1 _for_ 0.000039300
## 14 2 _for_ 0.000003660
## 15 1 _hkhodary 0.000039300
## 16 2 _hkhodary 0.000003660
