source('create_datasets.R')
library(readr)
library(dplyr)
library(ggplot2)
library(stringr)
3 main types of machine learning:
Two major goals:
dimensionality reduction
str(x)
## num [1:300, 1:2] 3.37 1.44 2.36 2.63 2.4 ...
head(x)
## [,1] [,2]
## [1,] 3.370958 1.995379
## [2,] 1.435302 2.760242
## [3,] 2.363128 2.038991
## [4,] 2.632863 2.735072
## [5,] 2.404268 1.853527
## [6,] 1.893875 1.942113
# Create the k-means model: km.out
km.out <- kmeans(x, centers = 3, nstart = 20)
# Inspect the result
summary(km.out)
## Length Class Mode
## cluster 300 -none- numeric
## centers 6 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
# Print the cluster membership component of the model
km.out$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 3 3 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 3 3 2
## [281] 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 2 3 3
# Print the km.out object
km.out
## K-means clustering with 3 clusters of sizes 150, 98, 52
##
## Cluster means:
## [,1] [,2]
## 1 -5.0556758 1.96991743
## 2 2.2171113 2.05110690
## 3 0.6642455 -0.09132968
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 3 3 3 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 3 3 2
## [281] 3 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 2 3 3
##
## Within cluster sum of squares by cluster:
## [1] 295.16925 148.64781 95.50625
## (between_SS / total_SS = 87.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of x
plot(x,
col = km.out$cluster,
main = "k-means with 3 clusters",
xlab = "",
ylab = "")
Process of k-means:
model selection:
determining number of clusters
# Set up 2 x 3 plotting grid
par(mfrow = c(2, 3))
# Set seed
set.seed(1)
for(i in 1:6) {
# Run kmeans() on x with three clusters and one start
km.out <- kmeans(x, centers = 3, nstart = 1)
# Plot clusters
plot(x, col = km.out$cluster,
main = km.out$tot.withinss,
xlab = "", ylab = "")
}
# Initialize total within sum of squares error: wss
wss <- 0
# For 1 to 15 cluster centers
for (i in 1:15) {
km.out <- kmeans(x, centers = i, nstart = 20)
# Save total within sum of squares to wss variable
wss[i] <- km.out$tot.withinss
}
# Plot total within sum of squares vs. number of clusters
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Set k equal to the number of clusters corresponding to the elbow location
k <- 2
Data challenges:
pokemon_raw <- read_csv('data/Pokemon.csv')
head(pokemon_raw)
## # A tibble: 6 x 13
## `#` Name `Type 1` `Type 2` Total HP Attack Defense
## <int> <chr> <chr> <chr> <int> <int> <int> <int>
## 1 1 Bulbasaur Grass Poison 318 45 49 49
## 2 2 Ivysaur Grass Poison 405 60 62 63
## 3 3 Venusaur Grass Poison 525 80 82 83
## 4 3 VenusaurMega Venusaur Grass Poison 625 80 100 123
## 5 4 Charmander Fire <NA> 309 39 52 43
## 6 5 Charmeleon Fire <NA> 405 58 64 58
## # ... with 5 more variables: `Sp. Atk` <int>, `Sp. Def` <int>,
## # Speed <int>, Generation <int>, Legendary <chr>
pokemon <- pokemon_raw %>% select(6:11)
head(pokemon)
## # A tibble: 6 x 6
## HP Attack Defense `Sp. Atk` `Sp. Def` Speed
## <int> <int> <int> <int> <int> <int>
## 1 45 49 49 65 65 45
## 2 60 62 63 80 80 60
## 3 80 82 83 100 100 80
## 4 80 100 123 122 120 80
## 5 39 52 43 60 50 65
## 6 58 64 58 80 65 80
# Initialize total within sum of squares error: wss
wss <- 0
# Look over 1 to 15 possible clusters
for (i in 1:15) {
# Fit the model: km.out
km.out <- kmeans(pokemon, centers = i, nstart = 20, iter.max = 50)
# Save the within cluster sum of squares
wss[i] <- km.out$tot.withinss
}
# Produce a scree plot
plot(1:15, wss, type = "b",
xlab = "Number of Clusters",
ylab = "Within groups sum of squares")
# Select number of clusters
k <- 4
# Build model with k clusters: km.out
km.pokemon <- kmeans(pokemon, centers = k, nstart = 20, iter.max = 50)
# View the resulting model
km.pokemon
## K-means clustering with 4 clusters of sizes 114, 115, 288, 283
##
## Cluster means:
## HP Attack Defense Sp. Atk Sp. Def Speed
## 1 89.20175 121.09649 92.73684 120.45614 97.67544 100.44737
## 2 71.30435 92.91304 121.42609 63.89565 88.23478 52.36522
## 3 79.18056 81.31944 69.19097 82.01042 77.53125 80.10417
## 4 50.29682 54.03180 51.62898 47.90459 49.15548 49.74912
##
## Clustering vector:
## [1] 4 3 3 1 4 3 3 1 1 4 3 3 1 4 4 3 4 4 3 3 4 4 3 1 4 3 4 3 4 3 4 3 4 2 4
## [36] 4 3 4 4 3 4 3 4 3 4 3 4 3 4 3 3 4 2 4 3 4 3 4 3 4 3 4 3 4 1 4 4 3 4 3
## [71] 3 1 4 3 2 4 3 3 4 3 4 2 2 3 3 4 2 2 4 3 4 4 3 4 3 4 3 4 2 4 3 3 1 2 4
## [106] 3 4 2 4 3 4 3 4 2 3 2 4 4 2 4 2 3 3 3 1 4 3 4 3 4 3 3 3 3 3 3 2 1 3 4
## [141] 3 1 3 4 4 3 3 1 3 4 2 4 2 3 1 3 1 1 1 4 3 1 1 1 1 1 4 3 3 4 3 3 4 3 3
## [176] 4 3 4 3 4 3 4 4 3 4 3 4 4 4 4 3 4 3 4 4 3 1 2 4 3 2 3 4 4 3 4 4 3 3 4
## [211] 2 3 2 3 3 3 4 3 3 4 2 3 2 2 2 4 3 3 2 2 2 3 2 3 4 3 4 2 4 3 4 4 3 4 3
## [246] 2 4 3 1 3 4 2 3 3 4 4 2 4 4 4 3 3 1 1 2 4 3 1 1 1 1 1 4 3 3 1 4 3 1 1
## [281] 4 3 3 1 4 3 4 3 4 4 3 4 4 4 4 3 4 4 3 4 3 4 3 4 4 3 1 4 3 4 3 4 3 1 4
## [316] 3 4 4 4 3 4 3 4 2 4 4 4 2 4 2 4 2 2 2 4 3 3 4 3 1 3 3 3 3 3 4 3 4 3 1
## [351] 3 3 4 3 1 2 4 3 4 4 4 3 4 3 4 3 1 3 3 3 3 4 3 4 3 4 2 4 2 4 2 4 3 3 2
## [386] 4 3 1 4 2 3 3 3 1 4 4 3 1 4 3 3 4 2 2 2 4 4 2 1 1 4 2 2 1 2 2 2 1 1 1
## [421] 1 1 1 1 1 1 1 1 1 1 2 1 4 2 2 4 3 3 4 3 3 4 4 3 4 3 4 4 4 4 3 4 3 4 3
## [456] 2 2 4 2 2 2 3 4 2 3 4 3 4 3 4 3 3 4 3 4 3 1 3 3 4 3 4 4 3 4 2 4 4 4 3
## [491] 2 4 3 1 1 3 4 1 1 4 2 4 2 4 3 3 4 3 4 4 3 1 3 2 2 2 2 1 1 3 3 2 3 2 3
## [526] 3 3 1 2 2 3 3 3 3 3 3 3 2 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 1 4 3 3 4 3 3
## [561] 4 3 3 4 3 4 4 2 4 3 4 3 4 3 4 3 4 3 4 4 3 4 3 4 2 2 4 3 4 3 3 2 4 2 2
## [596] 4 3 3 2 3 4 2 3 4 4 3 4 3 4 3 3 4 4 3 4 3 3 3 4 2 4 2 3 4 2 2 2 3 1 4
## [631] 3 4 3 4 3 4 3 3 4 4 3 4 3 4 3 3 4 3 3 4 2 4 3 4 3 3 4 3 4 2 4 2 2 4 3
## [666] 3 4 3 4 4 3 4 3 1 4 3 3 4 3 3 4 3 2 4 2 4 2 2 4 3 4 2 3 2 4 3 1 4 3 1
## [701] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 2 4 3 3 4 3 3 4 3 4 4 3 4 4 3
## [736] 4 3 4 4 3 4 3 4 3 3 4 3 3 4 2 1 2 4 3 4 3 4 3 4 2 4 2 4 3 4 3 4 2 4 3
## [771] 3 3 3 2 4 3 1 3 4 3 4 4 4 4 2 2 2 2 4 2 4 3 1 1 1 2 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 408271.9 455965.7 871531.3 513476.6
## (between_SS / total_SS = 47.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Plot of Defense vs. Speed by cluster membership
plot(pokemon[, c("Defense", "Speed")],
col = km.pokemon$cluster,
main = paste("k-means clustering of Pokemon with", k, "clusters"),
xlab = "Defense", ylab = "Speed")
dist()
function
hclust()
functionhead(x)
## [,1] [,2]
## [1,] 3.370958 1.995379
## [2,] 1.435302 2.760242
## [3,] 2.363128 2.038991
## [4,] 2.632863 2.735072
## [5,] 2.404268 1.853527
## [6,] 1.893875 1.942113
# Create hierarchical clustering model: hclust.out
hclust.out <- hclust(dist(x))
# Inspect the result
summary(hclust.out)
## Length Class Mode
## merge 598 -none- numeric
## height 299 -none- numeric
## order 300 -none- numeric
## labels 0 -none- NULL
## method 1 -none- character
## call 2 -none- call
## dist.method 1 -none- character
cutree()
function in R lets you split the hierarchical clusters into set clusters by number or by distance (height)plot(hclust.out)
abline(h = 7, col = "red")
# Cut by height
cutree(hclust.out, h = 7)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1
## [36] 2 2 2 2 1 1 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1
## [71] 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 2 2 1 1 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [176] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [211] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [246] 3 3 3 3 3 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1
## [281] 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2
# Cut by number of clusters
cutree(hclust.out, k = 3)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 2 1 1 1 2 1 2 1 2 1 1 1 2 1
## [36] 2 2 2 2 1 1 1 1 2 2 1 1 1 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 1 1
## [71] 1 1 1 2 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 2 2 1 1 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [176] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [211] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [246] 3 3 3 3 3 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 1 1
## [281] 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 1 2 2
complete
: pairwise similarty between all observations in cluster 1 and 2, uses largest of similaritiessingle
: same as above but uses the smallest of similaritiesaverage
: same as above but uses average of similaritiescentroid
: finds centroid of cluster 1 and 2, uses similarity between tow centroidscomplete
and average produce more balanced treess and are more commonly usedsingle
fuses observations in one at a time and produces more unblanced treescentroid
can create inversion where clusters are put below single values. its not used often# Cluster using complete linkage: hclust.complete
hclust.complete <- hclust(dist(x), method = "complete")
# Cluster using average linkage: hclust.average
hclust.average <- hclust(dist(x), method = "average")
# Cluster using single linkage: hclust.single
hclust.single <- hclust(dist(x), method = "single")
# Plot dendrogram of hclust.complete
plot(hclust.complete, main = "Complete")
# Plot dendrogram of hclust.average
plot(hclust.average, main = "Average")
# Plot dendrogram of hclust.single
plot(hclust.single, main = "Single")
# View column means
colMeans(pokemon)
## HP Attack Defense Sp. Atk Sp. Def Speed
## 69.25875 79.00125 73.84250 72.82000 71.90250 68.27750
# View column standard deviations
apply(pokemon, 2, sd)
## HP Attack Defense Sp. Atk Sp. Def Speed
## 25.53467 32.45737 31.18350 32.72229 27.82892 29.06047
# Scale the data
pokemon.scaled <- scale(pokemon)
# Create hierarchical clustering model: hclust.pokemon
hclust.pokemon <- hclust(dist(pokemon.scaled), method = "complete")
# Apply cutree() to hclust.pokemon: cut.pokemon
cut.pokemon <- cutree(hclust.pokemon, k = 3)
# Compare methods
table(km.pokemon$cluster, cut.pokemon)
## cut.pokemon
## 1 2 3
## 1 114 0 0
## 2 114 0 1
## 3 277 11 0
## 4 283 0 0
Example: Principle components with iris dataset
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
pr.iris <- prcomp(x = iris[-5], scale = F, center = T)
summary(pr.iris)
## Importance of components%s:
## PC1 PC2 PC3 PC4
## Standard deviation 2.0563 0.49262 0.2797 0.15439
## Proportion of Variance 0.9246 0.05307 0.0171 0.00521
## Cumulative Proportion 0.9246 0.97769 0.9948 1.00000
pr.iris
## Standard deviations (1, .., p=4):
## [1] 2.0562689 0.4926162 0.2796596 0.1543862
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## Sepal.Length 0.36138659 -0.65658877 0.58202985 0.3154872
## Sepal.Width -0.08452251 -0.73016143 -0.59791083 -0.3197231
## Petal.Length 0.85667061 0.17337266 -0.07623608 -0.4798390
## Petal.Width 0.35828920 0.07548102 -0.54583143 0.7536574
pokemon_pr <- pokemon %>% select(HP, Attack, Defense, Speed)
glimpse(pokemon_pr)
## Observations: 800
## Variables: 4
## $ HP <int> 45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45...
## $ Attack <int> 49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103...
## $ Defense <int> 49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120...
## $ Speed <int> 45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78,...
summary(pokemon_pr)
## HP Attack Defense Speed
## Min. : 1.00 Min. : 5 Min. : 5.00 Min. : 5.00
## 1st Qu.: 50.00 1st Qu.: 55 1st Qu.: 50.00 1st Qu.: 45.00
## Median : 65.00 Median : 75 Median : 70.00 Median : 65.00
## Mean : 69.26 Mean : 79 Mean : 73.84 Mean : 68.28
## 3rd Qu.: 80.00 3rd Qu.:100 3rd Qu.: 90.00 3rd Qu.: 90.00
## Max. :255.00 Max. :190 Max. :230.00 Max. :180.00
pr.pokemon <- prcomp(x = pokemon_pr, scale = T, center = T)
summary(pr.pokemon)
## Importance of components%s:
## PC1 PC2 PC3 PC4
## Standard deviation 1.3721 0.9933 0.8526 0.6354
## Proportion of Variance 0.4707 0.2467 0.1817 0.1009
## Cumulative Proportion 0.4707 0.7173 0.8991 1.0000
pr.pokemon
## Standard deviations (1, .., p=4):
## [1] 1.3721424 0.9932783 0.8526020 0.6353685
##
## Rotation (n x k) = (4 x 4):
## PC1 PC2 PC3 PC4
## HP 0.5009303 -0.06463396 0.8300858 -0.2363236
## Attack 0.6301797 0.02703796 -0.1621455 0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed 0.3798566 0.78253440 -0.2832778 -0.4038596
The PCA models produce additional diagnostic and output components:
center
- the column means used to center the datascale
- the column sd used to scale the datarotation
- the direction of the prin comp vetors in terms of the original features/variables. This information somehow allows you to define new data in terms of the originial principal components.x
- the value of each observation in the original dataset projected to the principal componentspr.pokemon$center
## HP Attack Defense Speed
## 69.25875 79.00125 73.84250 68.27750
pr.pokemon$scale
## HP Attack Defense Speed
## 25.53467 32.45737 31.18350 29.06047
pr.pokemon$rotation
## PC1 PC2 PC3 PC4
## HP 0.5009303 -0.06463396 0.8300858 -0.2363236
## Attack 0.6301797 0.02703796 -0.1621455 0.7588487
## Defense 0.4556878 -0.61865282 -0.4521283 -0.4529871
## Speed 0.3798566 0.78253440 -0.2832778 -0.4038596
head(pr.pokemon$x,10)
## PC1 PC2 PC3 PC4
## [1,] -1.7256845 -0.097546271 -0.05163582 0.207456766
## [2,] -0.7783645 0.001484139 0.02184005 -0.039259320
## [3,] 0.5559879 0.109293957 0.08715421 -0.325236471
## [4,] 1.4899932 -0.669275804 -0.58272583 -0.485458941
## [5,] -1.6113972 0.577730653 -0.36963560 0.142341152
## [6,] -0.5904092 0.645964030 -0.17563029 -0.179301322
## [7,] 0.7439432 0.753773847 -0.11031614 -0.465278473
## [8,] 2.1192939 0.137402684 -0.81858156 0.130820647
## [9,] 1.1322555 0.770434451 -0.21022907 0.002318739
## [10,] -1.5570505 -0.467129383 -0.29163598 -0.011297649
PCbiplot()
on stack overflow, made with ggplot2 and modified this to work a little better. I use this in most cases because its much clearer.# creating a biplot
# this does not look as nice as the one he had in the video
biplot(pr.iris)
PCbiplot(pr.iris)
# Getting proportion of variance for a scree plot
pr.var <- pr.iris$sdev^2
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve,
xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0,1),
type = "b")
PCbiplot(pr.pokemon)
# Variability of each principal component: pr.var
pr.var <- pr.pokemon$sdev^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
pve
## [1] 0.4706937 0.2466505 0.1817326 0.1009233
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
3 things need to be considered for a succesful PCA:
Importance of scaling:
mtcars
dataset has very different means and sddata(mtcars)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
round(colMeans(mtcars), 2)
## mpg cyl disp hp drat wt qsec vs am gear
## 20.09 6.19 230.72 146.69 3.60 3.22 17.85 0.44 0.41 3.69
## carb
## 2.81
round(apply(mtcars, 2, sd), 2)
## mpg cyl disp hp drat wt qsec vs am gear
## 6.03 1.79 123.94 68.56 0.53 0.98 1.79 0.50 0.50 0.74
## carb
## 1.62
pr.mtcars_no_scale <- prcomp(x = mtcars, scale = F, center = F)
pr.mtcars_scale <- prcomp(x = mtcars, scale = T, center = T)
PCbiplot(pr.mtcars_no_scale)
PCbiplot(pr.mtcars_scale)
# Mean of each variable
colMeans(pokemon_pr)
## HP Attack Defense Speed
## 69.25875 79.00125 73.84250 68.27750
# Standard deviation of each variable
apply(pokemon_pr, 2, sd)
## HP Attack Defense Speed
## 25.53467 32.45737 31.18350 29.06047
# PCA model with scaling: pr.with.scaling
pr.with.scaling <- prcomp(pokemon_pr, scale = T, center = T)
# PCA model without scaling: pr.without.scaling
pr.without.scaling <- prcomp(pokemon_pr, scale = F, center = F)
# Create biplots of both for comparison
PCbiplot(pr.without.scaling)
PCbiplot(pr.with.scaling)
url <- "http://s3.amazonaws.com/assets.datacamp.com/production/course_1903/datasets/WisconsinCancer.csv"
# Download the data: wisc.df
wisc.df <- read.csv(url)
str(wisc.df)
## 'data.frame': 569 obs. of 33 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ X : logi NA NA NA NA NA NA ...
# Convert the features of the data: wisc.data
wisc.data <- as.matrix(wisc.df[, 3:32])
str(wisc.data)
## num [1:569, 1:30] 18 20.6 19.7 11.4 20.3 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
head(wisc.data)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## [1,] 17.99 10.38 122.80 1001.0 0.11840
## [2,] 20.57 17.77 132.90 1326.0 0.08474
## [3,] 19.69 21.25 130.00 1203.0 0.10960
## [4,] 11.42 20.38 77.58 386.1 0.14250
## [5,] 20.29 14.34 135.10 1297.0 0.10030
## [6,] 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## [1,] 0.27760 0.3001 0.14710 0.2419
## [2,] 0.07864 0.0869 0.07017 0.1812
## [3,] 0.15990 0.1974 0.12790 0.2069
## [4,] 0.28390 0.2414 0.10520 0.2597
## [5,] 0.13280 0.1980 0.10430 0.1809
## [6,] 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## [1,] 0.07871 1.0950 0.9053 8.589 153.40
## [2,] 0.05667 0.5435 0.7339 3.398 74.08
## [3,] 0.05999 0.7456 0.7869 4.585 94.03
## [4,] 0.09744 0.4956 1.1560 3.445 27.23
## [5,] 0.05883 0.7572 0.7813 5.438 94.44
## [6,] 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se
## [1,] 0.006399 0.04904 0.05373 0.01587
## [2,] 0.005225 0.01308 0.01860 0.01340
## [3,] 0.006150 0.04006 0.03832 0.02058
## [4,] 0.009110 0.07458 0.05661 0.01867
## [5,] 0.011490 0.02461 0.05688 0.01885
## [6,] 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## [1,] 0.03003 0.006193 25.38 17.33
## [2,] 0.01389 0.003532 24.99 23.41
## [3,] 0.02250 0.004571 23.57 25.53
## [4,] 0.05963 0.009208 14.91 26.50
## [5,] 0.01756 0.005115 22.54 16.67
## [6,] 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## [1,] 184.60 2019.0 0.1622 0.6656
## [2,] 158.80 1956.0 0.1238 0.1866
## [3,] 152.50 1709.0 0.1444 0.4245
## [4,] 98.87 567.7 0.2098 0.8663
## [5,] 152.20 1575.0 0.1374 0.2050
## [6,] 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## [1,] 0.7119 0.2654 0.4601
## [2,] 0.2416 0.1860 0.2750
## [3,] 0.4504 0.2430 0.3613
## [4,] 0.6869 0.2575 0.6638
## [5,] 0.4000 0.1625 0.2364
## [6,] 0.5355 0.1741 0.3985
## fractal_dimension_worst
## [1,] 0.11890
## [2,] 0.08902
## [3,] 0.08758
## [4,] 0.17300
## [5,] 0.07678
## [6,] 0.12440
# Set the row names of wisc.data
row.names(wisc.data) <- wisc.df$id
head(wisc.data)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 842302 17.99 10.38 122.80 1001.0 0.11840
## 842517 20.57 17.77 132.90 1326.0 0.08474
## 84300903 19.69 21.25 130.00 1203.0 0.10960
## 84348301 11.42 20.38 77.58 386.1 0.14250
## 84358402 20.29 14.34 135.10 1297.0 0.10030
## 843786 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 842302 0.27760 0.3001 0.14710 0.2419
## 842517 0.07864 0.0869 0.07017 0.1812
## 84300903 0.15990 0.1974 0.12790 0.2069
## 84348301 0.28390 0.2414 0.10520 0.2597
## 84358402 0.13280 0.1980 0.10430 0.1809
## 843786 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 842302 0.07871 1.0950 0.9053 8.589 153.40
## 842517 0.05667 0.5435 0.7339 3.398 74.08
## 84300903 0.05999 0.7456 0.7869 4.585 94.03
## 84348301 0.09744 0.4956 1.1560 3.445 27.23
## 84358402 0.05883 0.7572 0.7813 5.438 94.44
## 843786 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se
## 842302 0.006399 0.04904 0.05373 0.01587
## 842517 0.005225 0.01308 0.01860 0.01340
## 84300903 0.006150 0.04006 0.03832 0.02058
## 84348301 0.009110 0.07458 0.05661 0.01867
## 84358402 0.011490 0.02461 0.05688 0.01885
## 843786 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 842302 0.03003 0.006193 25.38 17.33
## 842517 0.01389 0.003532 24.99 23.41
## 84300903 0.02250 0.004571 23.57 25.53
## 84348301 0.05963 0.009208 14.91 26.50
## 84358402 0.01756 0.005115 22.54 16.67
## 843786 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 842302 184.60 2019.0 0.1622 0.6656
## 842517 158.80 1956.0 0.1238 0.1866
## 84300903 152.50 1709.0 0.1444 0.4245
## 84348301 98.87 567.7 0.2098 0.8663
## 84358402 152.20 1575.0 0.1374 0.2050
## 843786 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 842302 0.7119 0.2654 0.4601
## 842517 0.2416 0.1860 0.2750
## 84300903 0.4504 0.2430 0.3613
## 84348301 0.6869 0.2575 0.6638
## 84358402 0.4000 0.1625 0.2364
## 843786 0.5355 0.1741 0.3985
## fractal_dimension_worst
## 842302 0.11890
## 842517 0.08902
## 84300903 0.08758
## 84348301 0.17300
## 84358402 0.07678
## 843786 0.12440
# Create diagnosis vector
diagnosis <- as.numeric(wisc.df$diagnosis == "M")
str(wisc.data)
## num [1:569, 1:30] 18 20.6 19.7 11.4 20.3 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:569] "842302" "842517" "84300903" "84348301" ...
## ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
colnames(wisc.data)
## [1] "radius_mean" "texture_mean"
## [3] "perimeter_mean" "area_mean"
## [5] "smoothness_mean" "compactness_mean"
## [7] "concavity_mean" "concave.points_mean"
## [9] "symmetry_mean" "fractal_dimension_mean"
## [11] "radius_se" "texture_se"
## [13] "perimeter_se" "area_se"
## [15] "smoothness_se" "compactness_se"
## [17] "concavity_se" "concave.points_se"
## [19] "symmetry_se" "fractal_dimension_se"
## [21] "radius_worst" "texture_worst"
## [23] "perimeter_worst" "area_worst"
## [25] "smoothness_worst" "compactness_worst"
## [27] "concavity_worst" "concave.points_worst"
## [29] "symmetry_worst" "fractal_dimension_worst"
str_match(colnames(wisc.data), "_mean")
## [,1]
## [1,] "_mean"
## [2,] "_mean"
## [3,] "_mean"
## [4,] "_mean"
## [5,] "_mean"
## [6,] "_mean"
## [7,] "_mean"
## [8,] "_mean"
## [9,] "_mean"
## [10,] "_mean"
## [11,] NA
## [12,] NA
## [13,] NA
## [14,] NA
## [15,] NA
## [16,] NA
## [17,] NA
## [18,] NA
## [19,] NA
## [20,] NA
## [21,] NA
## [22,] NA
## [23,] NA
## [24,] NA
## [25,] NA
## [26,] NA
## [27,] NA
## [28,] NA
## [29,] NA
## [30,] NA
table(diagnosis)
## diagnosis
## 0 1
## 357 212
# Check column means and standard deviations
round(colMeans(wisc.data), 2)
## radius_mean texture_mean perimeter_mean
## 14.13 19.29 91.97
## area_mean smoothness_mean compactness_mean
## 654.89 0.10 0.10
## concavity_mean concave.points_mean symmetry_mean
## 0.09 0.05 0.18
## fractal_dimension_mean radius_se texture_se
## 0.06 0.41 1.22
## perimeter_se area_se smoothness_se
## 2.87 40.34 0.01
## compactness_se concavity_se concave.points_se
## 0.03 0.03 0.01
## symmetry_se fractal_dimension_se radius_worst
## 0.02 0.00 16.27
## texture_worst perimeter_worst area_worst
## 25.68 107.26 880.58
## smoothness_worst compactness_worst concavity_worst
## 0.13 0.25 0.27
## concave.points_worst symmetry_worst fractal_dimension_worst
## 0.11 0.29 0.08
round(apply(wisc.data, 2, sd), 2)
## radius_mean texture_mean perimeter_mean
## 3.52 4.30 24.30
## area_mean smoothness_mean compactness_mean
## 351.91 0.01 0.05
## concavity_mean concave.points_mean symmetry_mean
## 0.08 0.04 0.03
## fractal_dimension_mean radius_se texture_se
## 0.01 0.28 0.55
## perimeter_se area_se smoothness_se
## 2.02 45.49 0.00
## compactness_se concavity_se concave.points_se
## 0.02 0.03 0.01
## symmetry_se fractal_dimension_se radius_worst
## 0.01 0.00 4.83
## texture_worst perimeter_worst area_worst
## 6.15 33.60 569.36
## smoothness_worst compactness_worst concavity_worst
## 0.02 0.16 0.21
## concave.points_worst symmetry_worst fractal_dimension_worst
## 0.07 0.06 0.02
# Execute PCA, scaling if appropriate: wisc.pr
wisc.pr <- prcomp(wisc.data, scale = T, center = T)
# Look at summary of results
summary(wisc.pr)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Create a biplot of wisc.pr
PCbiplot(wisc.pr)
# Scatter plot observations by components 1 and 2
plot(wisc.pr$x[, c(1, 2)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC2")
# Repeat for components 1 and 3
plot(wisc.pr$x[, c(1, 3)], col = (diagnosis + 1),
xlab = "PC1", ylab = "PC3")
# Do additional data exploration of your choosing below (optional)
plot(wisc.pr$x[, c(2, 3)], col = (diagnosis + 1),
xlab = "PC2", ylab = "PC3")
# Set up 1 x 2 plotting grid
par(mfrow = c(1, 2))
# Calculate variability of each component
pr.var <- wisc.pr$sdev^2
# Variance explained by each principal component: pve
pve <- pr.var / sum(pr.var)
# Plot variance explained for each principal component
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
# Plot cumulative proportion of variance explained
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
ylim = c(0, 1), type = "b")
concave.points_mean
?
wisc.pr$rotation[1:10,1:2]
## PC1 PC2
## radius_mean -0.21890244 0.23385713
## texture_mean -0.10372458 0.05970609
## perimeter_mean -0.22753729 0.21518136
## area_mean -0.22099499 0.23107671
## smoothness_mean -0.14258969 -0.18611302
## compactness_mean -0.23928535 -0.15189161
## concavity_mean -0.25840048 -0.06016536
## concave.points_mean -0.26085376 0.03476750
## symmetry_mean -0.13816696 -0.19034877
## fractal_dimension_mean -0.06436335 -0.36657547
# Scale the wisc.data data: data.scaled
head(wisc.data)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 842302 17.99 10.38 122.80 1001.0 0.11840
## 842517 20.57 17.77 132.90 1326.0 0.08474
## 84300903 19.69 21.25 130.00 1203.0 0.10960
## 84348301 11.42 20.38 77.58 386.1 0.14250
## 84358402 20.29 14.34 135.10 1297.0 0.10030
## 843786 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 842302 0.27760 0.3001 0.14710 0.2419
## 842517 0.07864 0.0869 0.07017 0.1812
## 84300903 0.15990 0.1974 0.12790 0.2069
## 84348301 0.28390 0.2414 0.10520 0.2597
## 84358402 0.13280 0.1980 0.10430 0.1809
## 843786 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 842302 0.07871 1.0950 0.9053 8.589 153.40
## 842517 0.05667 0.5435 0.7339 3.398 74.08
## 84300903 0.05999 0.7456 0.7869 4.585 94.03
## 84348301 0.09744 0.4956 1.1560 3.445 27.23
## 84358402 0.05883 0.7572 0.7813 5.438 94.44
## 843786 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se
## 842302 0.006399 0.04904 0.05373 0.01587
## 842517 0.005225 0.01308 0.01860 0.01340
## 84300903 0.006150 0.04006 0.03832 0.02058
## 84348301 0.009110 0.07458 0.05661 0.01867
## 84358402 0.011490 0.02461 0.05688 0.01885
## 843786 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 842302 0.03003 0.006193 25.38 17.33
## 842517 0.01389 0.003532 24.99 23.41
## 84300903 0.02250 0.004571 23.57 25.53
## 84348301 0.05963 0.009208 14.91 26.50
## 84358402 0.01756 0.005115 22.54 16.67
## 843786 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 842302 184.60 2019.0 0.1622 0.6656
## 842517 158.80 1956.0 0.1238 0.1866
## 84300903 152.50 1709.0 0.1444 0.4245
## 84348301 98.87 567.7 0.2098 0.8663
## 84358402 152.20 1575.0 0.1374 0.2050
## 843786 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 842302 0.7119 0.2654 0.4601
## 842517 0.2416 0.1860 0.2750
## 84300903 0.4504 0.2430 0.3613
## 84348301 0.6869 0.2575 0.6638
## 84358402 0.4000 0.1625 0.2364
## 843786 0.5355 0.1741 0.3985
## fractal_dimension_worst
## 842302 0.11890
## 842517 0.08902
## 84300903 0.08758
## 84348301 0.17300
## 84358402 0.07678
## 843786 0.12440
data.scaled <- scale(wisc.data)
head(data.scaled)
## radius_mean texture_mean perimeter_mean area_mean
## 842302 1.0960995 -2.0715123 1.2688173 0.9835095
## 842517 1.8282120 -0.3533215 1.6844726 1.9070303
## 84300903 1.5784992 0.4557859 1.5651260 1.5575132
## 84348301 -0.7682333 0.2535091 -0.5921661 -0.7637917
## 84358402 1.7487579 -1.1508038 1.7750113 1.8246238
## 843786 -0.4759559 -0.8346009 -0.3868077 -0.5052059
## smoothness_mean compactness_mean concavity_mean
## 842302 1.5670875 3.2806281 2.65054179
## 842517 -0.8262354 -0.4866435 -0.02382489
## 84300903 0.9413821 1.0519999 1.36227979
## 84348301 3.2806668 3.3999174 1.91421287
## 84358402 0.2801253 0.5388663 1.36980615
## 843786 2.2354545 1.2432416 0.86554001
## concave.points_mean symmetry_mean fractal_dimension_mean
## 842302 2.5302489 2.215565542 2.2537638
## 842517 0.5476623 0.001391139 -0.8678888
## 84300903 2.0354398 0.938858720 -0.3976580
## 84348301 1.4504311 2.864862154 4.9066020
## 84358402 1.4272370 -0.009552062 -0.5619555
## 843786 0.8239307 1.004517928 1.8883435
## radius_se texture_se perimeter_se area_se smoothness_se
## 842302 2.4875451 -0.5647681 2.8305403 2.4853907 -0.2138135
## 842517 0.4988157 -0.8754733 0.2630955 0.7417493 -0.6048187
## 84300903 1.2275958 -0.7793976 0.8501802 1.1802975 -0.2967439
## 84348301 0.3260865 -0.1103120 0.2863415 -0.2881246 0.6890953
## 84358402 1.2694258 -0.7895490 1.2720701 1.1893103 1.4817634
## 843786 -0.2548461 -0.5921406 -0.3210217 -0.2890039 0.1562093
## compactness_se concavity_se concave.points_se symmetry_se
## 842302 1.31570389 0.7233897 0.66023900 1.1477468
## 842517 -0.69231710 -0.4403926 0.25993335 -0.8047423
## 84300903 0.81425704 0.2128891 1.42357487 0.2368272
## 84348301 2.74186785 0.8187979 1.11402678 4.7285198
## 84358402 -0.04847723 0.8277425 1.14319885 -0.3607748
## 843786 0.44515196 0.1598845 -0.06906279 0.1340009
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## 842302 0.90628565 1.8850310 -1.35809849 2.3015755
## 842517 -0.09935632 1.8043398 -0.36887865 1.5337764
## 84300903 0.29330133 1.5105411 -0.02395331 1.3462906
## 84348301 2.04571087 -0.2812170 0.13386631 -0.2497196
## 84358402 0.49888916 1.2974336 -1.46548091 1.3373627
## 843786 0.48641784 -0.1653528 -0.31356043 -0.1149083
## area_worst smoothness_worst compactness_worst concavity_worst
## 842302 1.9994782 1.3065367 2.6143647 2.1076718
## 842517 1.8888270 -0.3752817 -0.4300658 -0.1466200
## 84300903 1.4550043 0.5269438 1.0819801 0.8542223
## 84348301 -0.5495377 3.3912907 3.8899747 1.9878392
## 84358402 1.2196511 0.2203623 -0.3131190 0.6126397
## 843786 -0.2441054 2.0467119 1.7201029 1.2621327
## concave.points_worst symmetry_worst fractal_dimension_worst
## 842302 2.2940576 2.7482041 1.9353117
## 842517 1.0861286 -0.2436753 0.2809428
## 84300903 1.9532817 1.1512420 0.2012142
## 84348301 2.1738732 6.0407261 4.9306719
## 84358402 0.7286181 -0.8675896 -0.3967505
## 843786 0.9050914 1.7525273 2.2398308
# Calculate the (Euclidean) distances: data.dist
data.dist <- dist(data.scaled)
# Create a hierarchical clustering model: wisc.hclust
wisc.hclust <- hclust(data.dist, method = "complete")
plot(wisc.hclust)
# Cut tree so that it has 4 clusters: wisc.hclust.clusters
wisc.hclust.clusters <- cutree(wisc.hclust, k = 4)
# Compare cluster membership to actual diagnoses
table(wisc.hclust.clusters, diagnosis)
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
# count out of place observations based on cluster
# basically just summing the row mins here
sum(apply(table(wisc.hclust.clusters, diagnosis), 1, min))
## [1] 54
# Create a k-means model on wisc.data: wisc.km
head(wisc.data)
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 842302 17.99 10.38 122.80 1001.0 0.11840
## 842517 20.57 17.77 132.90 1326.0 0.08474
## 84300903 19.69 21.25 130.00 1203.0 0.10960
## 84348301 11.42 20.38 77.58 386.1 0.14250
## 84358402 20.29 14.34 135.10 1297.0 0.10030
## 843786 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave.points_mean symmetry_mean
## 842302 0.27760 0.3001 0.14710 0.2419
## 842517 0.07864 0.0869 0.07017 0.1812
## 84300903 0.15990 0.1974 0.12790 0.2069
## 84348301 0.28390 0.2414 0.10520 0.2597
## 84358402 0.13280 0.1980 0.10430 0.1809
## 843786 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 842302 0.07871 1.0950 0.9053 8.589 153.40
## 842517 0.05667 0.5435 0.7339 3.398 74.08
## 84300903 0.05999 0.7456 0.7869 4.585 94.03
## 84348301 0.09744 0.4956 1.1560 3.445 27.23
## 84358402 0.05883 0.7572 0.7813 5.438 94.44
## 843786 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave.points_se
## 842302 0.006399 0.04904 0.05373 0.01587
## 842517 0.005225 0.01308 0.01860 0.01340
## 84300903 0.006150 0.04006 0.03832 0.02058
## 84348301 0.009110 0.07458 0.05661 0.01867
## 84358402 0.011490 0.02461 0.05688 0.01885
## 843786 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 842302 0.03003 0.006193 25.38 17.33
## 842517 0.01389 0.003532 24.99 23.41
## 84300903 0.02250 0.004571 23.57 25.53
## 84348301 0.05963 0.009208 14.91 26.50
## 84358402 0.01756 0.005115 22.54 16.67
## 843786 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 842302 184.60 2019.0 0.1622 0.6656
## 842517 158.80 1956.0 0.1238 0.1866
## 84300903 152.50 1709.0 0.1444 0.4245
## 84348301 98.87 567.7 0.2098 0.8663
## 84358402 152.20 1575.0 0.1374 0.2050
## 843786 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 842302 0.7119 0.2654 0.4601
## 842517 0.2416 0.1860 0.2750
## 84300903 0.4504 0.2430 0.3613
## 84348301 0.6869 0.2575 0.6638
## 84358402 0.4000 0.1625 0.2364
## 843786 0.5355 0.1741 0.3985
## fractal_dimension_worst
## 842302 0.11890
## 842517 0.08902
## 84300903 0.08758
## 84348301 0.17300
## 84358402 0.07678
## 843786 0.12440
wisc.km <- kmeans(scale(wisc.data), centers = 2, nstart = 20)
# Compare k-means to actual diagnoses
table(wisc.km$cluster, diagnosis)
## diagnosis
## 0 1
## 1 343 37
## 2 14 175
sum(apply(table(wisc.km$cluster, diagnosis), 1, min))
## [1] 51
# Compare k-means to hierarchical clustering
table(wisc.hclust.clusters, wisc.km$cluster)
##
## wisc.hclust.clusters 1 2
## 1 17 160
## 2 0 7
## 3 363 20
## 4 0 2
sum(apply(table(wisc.hclust.clusters, wisc.km$cluster), 1, min))
## [1] 37
# Create a hierarchical clustering model: wisc.pr.hclust
summary(wisc.pr)
## Importance of components%s:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
wisc.pr.hclust <- hclust(dist(wisc.pr$x[, 1:7]), method = "complete")
# Cut model into 4 clusters: wisc.pr.hclust.clusters
wisc.pr.hclust.clusters <- cutree(wisc.pr.hclust, k = 4)
# Compare to actual diagnoses
t <- table(wisc.pr.hclust.clusters, diagnosis)
t
## diagnosis
## wisc.pr.hclust.clusters 0 1
## 1 5 113
## 2 350 97
## 3 2 0
## 4 0 2
sum(apply(t, 1, min))
## [1] 102
# Compare to k-means and hierarchical
t <- table(wisc.hclust.clusters, diagnosis)
t
## diagnosis
## wisc.hclust.clusters 0 1
## 1 12 165
## 2 2 5
## 3 343 40
## 4 0 2
sum(apply(t, 1, min))
## [1] 54
t <- table(wisc.km$cluster, diagnosis)
t
## diagnosis
## 0 1
## 1 343 37
## 2 14 175
sum(apply(t, 1, min))
## [1] 51