library(tidyverse)
library(cluster)
library(vegan)
library(factoextra)
library(fpc)
library(RWeka)
library(ggdendro)
library(NbClust)
data("USArrests")
Let's scale the data:
USArrests %>%
scale() -> arrest.scale
head(arrest.scale)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144 1.7589234 2.067820292
Colorado 0.02571456 0.3988593 0.8608085 1.864967207
Let's convert this to a distance matrix using the factoextra::get_dist() function:
arrest.scale %>%
get_dist(upper = TRUE, diag = TRUE) -> arrest.dist
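To see what get_dist() produced, we can peek at one corner of the distance matrix (a quick check of our own; as.matrix() just converts the dist object for display):
round(as.matrix(arrest.dist)[1:4, 1:4], 2)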
km.arrest <- kmeans(arrest.scale, centers = 3, nstart = 25)
km.arrest
K-means clustering with 3 clusters of sizes 13, 17, 20
Cluster means:
Murder Assault UrbanPop Rape
1 -0.9615407 -1.1066010 -0.9301069 -0.9667633
2 -0.4469795 -0.3465138 0.4788049 -0.2571398
3 1.0049340 1.0138274 0.1975853 0.8469650
Clustering vector:
Alabama Alaska Arizona Arkansas California
3 3 3 2 3
Colorado Connecticut Delaware Florida Georgia
3 2 2 3 3
Hawaii Idaho Illinois Indiana Iowa
2 1 3 2 1
Kansas Kentucky Louisiana Maine Maryland
2 1 3 1 3
Massachusetts Michigan Minnesota Mississippi Missouri
2 3 1 3 3
Montana Nebraska Nevada New Hampshire New Jersey
1 1 3 1 2
New Mexico New York North Carolina North Dakota Ohio
3 3 3 1 2
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
2 2 2 2 3
South Dakota Tennessee Texas Utah Vermont
1 3 3 2 1
Virginia Washington West Virginia Wisconsin Wyoming
2 2 1 1 2
Within cluster sum of squares by cluster:
[1] 11.95246 19.62285 46.74796
(between_SS / total_SS = 60.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
centers <- as.data.frame(km.arrest$centers)
centers$cluster <- rownames(centers)
centers %>%
gather(type, value, -cluster) %>%
ggplot() +
geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
facet_wrap(~cluster) +
theme_classic() +
theme(legend.position = "none")
So how would we describe these clusters?
The centers above are in standardized units; we can retrieve the original means and standard deviations from the attributes of our scaled data:
attr(arrest.scale,"scaled:center")
Murder Assault UrbanPop Rape
7.788 170.760 65.540 21.232
attr(arrest.scale,"scaled:scale")
Murder Assault UrbanPop Rape
4.355510 83.337661 14.474763 9.366385
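With these attributes we can back-transform the cluster centers into the original units, a small sketch: multiply by the standard deviations, then add the means.
km.arrest$centers %>%
  sweep(2, attr(arrest.scale, "scaled:scale"), "*") %>% # undo the scaling
  sweep(2, attr(arrest.scale, "scaled:center"), "+")    # undo the centering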
Let's visualize these clusters in ordination space. We can use factoextra::fviz_cluster() to illustrate how the states fall out:
fviz_cluster(km.arrest, data = arrest.scale,
ellipse.type = "convex",
palette = "jco",
ggtheme = theme_minimal())
factoextra::fviz_cluster(), while quick, can be clunky and difficult to get your plot displayed exactly how you would like. Here is how you can break the data apart and build the plot yourself.
trythis <- stats::prcomp(arrest.scale, scale = FALSE, center = FALSE) # data are already centered and scaled
state_scores <- as.data.frame(scores(trythis)) # vegan::scores() extracts the PC scores
state_scores$cluster <- km.arrest$cluster
state_scores$state <- rownames(state_scores)
head(state_scores)
Next we need to find which points fall on the outside of each group (i.e., the convex hull). We do this using the chull() function.
chull(state_scores %>% filter(cluster ==1) %>% select(PC1, PC2) )
[1] 13 5 7 3 12 11 9
grp.1 <- state_scores[state_scores$cluster == 1, ][chull(state_scores %>% filter(cluster ==1) %>% select(PC1, PC2) ), ] # hull values for cluster 1
grp.2 <- state_scores[state_scores$cluster == 2, ][chull(state_scores %>% filter(cluster ==2) %>% select(PC1, PC2) ), ] # hull values for cluster 2
grp.3 <- state_scores[state_scores$cluster == 3, ][chull(state_scores %>% filter(cluster ==3) %>% select(PC1, PC2) ), ] # hull values for cluster 3
all_hulls <- rbind(grp.1,grp.2,grp.3)
head(all_hulls)
ggplot(data = state_scores) +
geom_point(aes(x = PC1, y = PC2, color = as.factor(cluster))) +
geom_text(aes(x = PC1, y = PC2, color = as.factor(cluster), label = state)) +
geom_polygon(data = all_hulls, aes(x = PC1, y = PC2, fill = as.factor(cluster), colour = as.factor(cluster)), alpha = 0.25) +
theme_minimal()
Above we chose to separate the data into three clusters. But how do we decide what the appropriate number of clusters is?
There are several ways this can be done:
Cross-validation. A subset of the data is used to develop the model, which is then ‘verified’ with the rest of the data by checking the sum of squared distances from the held-out points to the group centroids and averaging them. The best number of clusters should have the lowest average squared distance (a sketch follows the elbow example below).
Elbow method. Similar to a scree plot, you choose the “elbow” in the plot, the point where the rate of decrease in within-cluster variance drops off.
# For k = 1, the total within-group sum of squares is (n - 1) * sum of the column variances
wss <- (nrow(arrest.scale) - 1) * sum(apply(arrest.scale, 2, var))
nclusters <- 15
for (i in 2:nclusters) {
  wss[i] <- sum(kmeans(arrest.scale, centers = i, nstart = 25)$withinss)
}
scree_data <- data.frame(wss = wss, clusters = 1:nclusters)
head(scree_data)
And plot the scree plot:
ggplot(data = scree_data) +
geom_line(aes(x = clusters, y = wss)) +
geom_point(aes(x = clusters, y = wss), size = 3) +
scale_x_continuous(breaks = 1:nclusters) +
theme_classic()
The elbow in this plot is at 4, but you can see that finding the “elbow” can be a little tricky.
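Before moving on, here is a minimal sketch of the cross-validation idea from the list above (our own illustration; the split size, seed, and range of k are arbitrary): fit k-means on a training split, then take the average squared distance from each held-out row to its nearest training centroid.
set.seed(123)
train_idx <- sample(seq_len(nrow(arrest.scale)), size = 35)
cv_err <- sapply(2:8, function(k) {
  km <- kmeans(arrest.scale[train_idx, ], centers = k, nstart = 25)
  test <- arrest.scale[-train_idx, , drop = FALSE]
  # squared distance from each held-out row to every centroid; keep the nearest
  d2 <- apply(test, 1, function(x) min(colSums((t(km$centers) - x)^2)))
  mean(d2)
})
cv_err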
Another approach is the silhouette method: the average silhouette width measures how well each observation sits within its assigned cluster, and the best number of clusters maximizes that average. factoextra::fviz_nbclust() computes and plots this for us.
fviz_nbclust(arrest.scale, kmeans,
             method = "silhouette")
There is also a type of clustering that clusters around a medoid rather than a centroid. In PAM (partitioning around medoids) clustering, each cluster is represented by one of the objects in the cluster, which makes PAM less sensitive to outliers than k-means. The fpc::pamk() function runs the clustering over a range of k and selects the best number by average silhouette width.
pamkout <- fpc::pamk(arrest.scale, krange = 2:15)
pamkout
$pamobject
Medoids:
ID Murder Assault UrbanPop Rape
New Mexico 31 0.8292944 1.3708088 0.3081225 1.1603196
Nebraska 27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
Clustering vector:
Alabama Alaska Arizona Arkansas California
1 1 1 2 1
Colorado Connecticut Delaware Florida Georgia
1 2 2 1 1
Hawaii Idaho Illinois Indiana Iowa
2 2 1 2 2
Kansas Kentucky Louisiana Maine Maryland
2 2 1 2 1
Massachusetts Michigan Minnesota Mississippi Missouri
2 1 2 1 1
Montana Nebraska Nevada New Hampshire New Jersey
2 2 1 2 2
New Mexico New York North Carolina North Dakota Ohio
1 1 1 2 2
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
2 2 2 2 1
South Dakota Tennessee Texas Utah Vermont
2 1 1 2 2
Virginia Washington West Virginia Wisconsin Wyoming
2 2 2 2 2
Objective function:
build swap
1.441358 1.368969
Available components:
[1] "medoids" "id.med" "clustering" "objective" "isolation" "clusinfo"
[7] "silinfo" "diss" "call" "data"
$nc
[1] 2
$crit
[1] 0.0000000 0.4084890 0.3143656 0.3389904 0.3105170 0.2629987 0.2243815 0.2386072
[9] 0.2466113 0.2447023 0.2525199 0.2548601 0.2514600 0.2504588 0.2400069
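The $crit values are the average silhouette widths for each candidate number of clusters (k = 1 is set to 0). A quick plot of our own makes the k = 2 choice obvious:
data.frame(k = 1:15, crit = pamkout$crit) %>%
  ggplot(aes(x = k, y = crit)) +
  geom_line() +
  geom_point(size = 3) +
  scale_x_continuous(breaks = 1:15) +
  theme_classic()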
medoid_data <- data.frame(pamkout$pamobject$medoids)
medoid_data$state <- rownames(medoid_data)
medoid_data %>%
gather(type, value, -state) %>%
ggplot() +
geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
facet_wrap(~state) +
theme_classic() +
theme(legend.position = "none")
# WPM("refresh-cache")
# WPM("list-packages", "available")
# WPM("install-package", "XMeans")
xout<-XMeans(arrest.scale)
xout
XMeans
======
Requested iterations : 1
Iterations performed : 1
Splits prepared : 2
Splits performed : 0
Cutoff factor : 0.5
Percentage of splits accepted
by cutoff factor : 0 %
------
Cutoff factor : 0.5
------
Cluster centers : 2 centers
Cluster 0
1.004934034580132 1.0138273518643042 0.19758526759992892 0.8469650134151087
Cluster 1
-0.6699560230534215 -0.675884901242869 -0.1317235117332867 -0.564643342276739
Distortion: 16.966612
BIC-Value : -20.965379
xout$class_ids
Alabama Alaska Arizona Arkansas California
0 0 0 1 0
Colorado Connecticut Delaware Florida Georgia
0 1 1 0 0
Hawaii Idaho Illinois Indiana Iowa
1 1 0 1 1
Kansas Kentucky Louisiana Maine Maryland
1 1 0 1 0
Massachusetts Michigan Minnesota Mississippi Missouri
1 0 1 0 0
Montana Nebraska Nevada New Hampshire New Jersey
1 1 0 1 1
New Mexico New York North Carolina North Dakota Ohio
0 0 0 1 1
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
1 1 1 1 0
South Dakota Tennessee Texas Utah Vermont
1 0 0 1 1
Virginia Washington West Virginia Wisconsin Wyoming
1 1 1 1 1
# Cluster centers copied from the XMeans printout above
xcenters <- data.frame(cluster = 1:2,
                       matrix(c( 1.004934034580132,  1.0138273518643042,  0.19758526759992892,  0.8469650134151087,
                                -0.6699560230534215, -0.675884901242869,  -0.1317235117332867, -0.564643342276739),
                              byrow = TRUE, ncol = 4))
names(xcenters)[2:5] <- colnames(arrest.scale)
xcenters %>%
gather(type, value, -cluster) %>%
ggplot() +
geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
facet_wrap(~cluster) +
theme_classic() +
theme(legend.position = "none")
There is another type of partitioning clustering, called CLARA, used primarily for large datasets. We will not be discussing it in this class, but the function cluster::clara() (among some others) can be used when clustering “big data”.
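As a quick illustration (our own sketch, not class material): clara() repeatedly subsamples the data and runs PAM on each subsample, which keeps medoid clustering feasible when n is large.
clara_out <- cluster::clara(arrest.scale, k = 2, samples = 50) # samples = number of subsamples drawn
clara_out$medoids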
There are two main types of hierarchical clustering: agglomerative (bottom-up, e.g., hclust()), which starts with every observation in its own cluster and successively merges them, and divisive (top-down, e.g., cluster::diana()), which starts with one cluster and successively splits it. We begin with the agglomerative approach.
USArrests %>%
  scale() %>%
  dist(method = "euclidean") -> arrest.euc # note the spelling: "euclidian" is not a valid method
arrest_clust <- hclust(arrest.euc, method = "ward.D2")
arrest_clust
Call:
hclust(d = arrest.euc, method = "ward.D2")
Cluster method : ward.D2
Distance : euclidean
Number of objects: 50
We have several options to plot our dendrograms.
plot(arrest_clust)
fviz_dend(arrest_clust,
k = 2, # two groups
cex = 0.5) # label size
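The two groups colored by fviz_dend() can be extracted as a membership vector with cutree():
cutree(arrest_clust, k = 2)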
ggdendrogram(arrest_clust, rotate = TRUE, size = 1)
Or extract the data and build your own plot
arrest_dendro <- as.dendrogram(arrest_clust)
dend_dendro_data <- dendro_data(arrest_dendro, type = "rectangle")
names(dend_dendro_data)
[1] "segments" "labels" "leaf_labels" "class"
head( dend_dendro_data$segments)
ggplot(data = dend_dendro_data$segments) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend), size = 1, colour = "red") +
  theme_void()
head( dend_dendro_data$labels)
ggplot(data = dend_dendro_data$segments) +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend), size = 1, colour = "red") +
  geom_text(data = dend_dendro_data$labels, aes(x = x, y = y, label = label),
            colour = "blue", hjust = 1, angle = 90, size = 3) +
  coord_cartesian(ylim = c(-3, 15)) +
  theme_void()
Choosing the appropriate number of clusters
The NbClust() function computes a large battery of validity indices and reports the number of clusters favored by the majority of them.
NbClust(arrest.scale,
distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "complete", index ="all") -> arrest.nb
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 9 proposed 2 as the best number of clusters
* 4 proposed 3 as the best number of clusters
* 6 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 10 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
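The returned object also stores the winning partition directly, so we can tabulate the two-cluster solution:
table(arrest.nb$Best.partition)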
fviz_nbclust(arrest.nb, ggtheme = theme_minimal())
Among all indices:
===================
* 2 proposed 0 as the best number of clusters
* 1 proposed 1 as the best number of clusters
* 9 proposed 2 as the best number of clusters
* 4 proposed 3 as the best number of clusters
* 6 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 10 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is 2 .
Now the divisive approach: cluster::diana() starts with all observations in a single cluster and repeatedly splits until every observation stands alone (stand = TRUE standardizes the columns first).
res.diana <- diana(USArrests, stand = TRUE)
res.diana
Merge:
[,1] [,2]
[1,] -15 -29
[2,] -13 -32
[3,] -23 -49
[4,] -14 -36
[5,] -20 -31
[6,] -16 -38
[7,] -37 -47
[8,] 1 -19
[9,] 4 -35
[10,] -41 -48
[11,] -46 -50
[12,] -12 -27
[13,] -21 -30
[14,] -24 -40
[15,] 2 -43
[16,] -17 -26
[17,] -1 -42
[18,] -10 -18
[19,] 9 6
[20,] -4 11
[21,] -11 -44
[22,] -7 -39
[23,] 8 -34
[24,] 10 -45
[25,] 5 -22
[26,] 17 18
[27,] 14 -33
[28,] 12 3
[29,] -5 -28
[30,] -3 25
[31,] 29 -6
[32,] 15 -25
[33,] 30 32
[34,] 22 13
[35,] 23 24
[36,] 19 7
[37,] 20 -8
[38,] 34 21
[39,] 28 16
[40,] 37 36
[41,] 26 27
[42,] 39 35
[43,] 33 -9
[44,] 43 31
[45,] 40 38
[46,] -2 44
[47,] 45 42
[48,] 41 46
[49,] 48 47
Order of objects:
[1] Alabama Tennessee Georgia Louisiana Mississippi
[6] South Carolina North Carolina Alaska Arizona Maryland
[11] New Mexico Michigan Illinois New York Texas
[16] Missouri Florida California Nevada Colorado
[21] Arkansas Virginia Wyoming Delaware Indiana
[26] Oklahoma Ohio Kansas Pennsylvania Oregon
[31] Washington Connecticut Rhode Island Massachusetts New Jersey
[36] Hawaii Utah Idaho Nebraska Minnesota
[41] Wisconsin Kentucky Montana Iowa New Hampshire
[46] Maine North Dakota South Dakota West Virginia Vermont
Height:
[1] 1.0340801 1.3687929 1.0408349 2.8205587 0.9771986 1.3960934 5.4623592 4.0837483
[9] 1.4720236 0.6743186 1.3385319 1.9655812 0.4336501 1.0044328 1.8355779 2.9848751
[17] 3.0534495 1.4621581 1.7044355 7.4792599 1.2084105 0.8788445 2.1852059 2.5927716
[25] 0.6281577 0.8500187 1.0553617 0.6765523 2.1808837 0.7311305 3.7810250 1.2903008
[33] 1.9665793 0.9626545 2.5032675 1.2698293 5.3463933 0.9132211 1.4571402 0.6215418
[41] 2.5227753 1.0181020 2.9549994 0.2619577 0.7930730 1.2918266 2.1082699 0.8721679
[49] 1.3129132
Divisive coefficient:
[1] 0.8530481
Available components:
[1] "order" "height" "dc" "merge" "diss" "call" "order.lab"
[8] "data"
fviz_dend(res.diana, cex = 0.5,
k = 4, # Cut in four groups
palette = "jco" # Color palette
)
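To recover the four-group membership drawn above, convert the diana result to an hclust object and cut the tree (a quick sketch):
diana_groups <- cutree(as.hclust(res.diana), k = 4)
table(diana_groups)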