library(tidyverse)
library(cluster)
library(vegan)
library(factoextra)
library(fpc)
library(RWeka)
library(ggdendro)
library(NbClust)data("USArrests")Lets scale the data
USArrests %>% 
  scale() -> arrest.scale
head(arrest.scale)               Murder   Assault   UrbanPop         Rape
Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
Arizona    0.07163341 1.4788032  0.9989801  1.042878388
Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144  1.7589234  2.067820292
Colorado   0.02571456 0.3988593  0.8608085  1.864967207lets convert this to a distance matrix using the factoextra::get_dist() function.
arrest.scale %>% 
  get_dist(upper = TRUE, diag = TRUE) -> arrest.distkm.arrest <- kmeans(arrest.scale, centers = 3, nstart = 25)
km.arrestK-means clustering with 3 clusters of sizes 13, 17, 20
Cluster means:
      Murder    Assault   UrbanPop       Rape
1 -0.9615407 -1.1066010 -0.9301069 -0.9667633
2 -0.4469795 -0.3465138  0.4788049 -0.2571398
3  1.0049340  1.0138274  0.1975853  0.8469650
Clustering vector:
       Alabama         Alaska        Arizona       Arkansas     California 
             3              3              3              2              3 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             3              2              2              3              3 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             2              1              3              2              1 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             2              1              3              1              3 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             2              3              1              3              3 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             1              1              3              1              2 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             3              3              3              1              2 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             2              2              2              2              3 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             1              3              3              2              1 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             2              2              1              1              2 
Within cluster sum of squares by cluster:
[1] 11.95246 19.62285 46.74796
 (between_SS / total_SS =  60.0 %)
Available components:
[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      centers<-as.data.frame(km.arrest$centers)
centers$cluster <- rownames(centers)
centers %>% 
  gather(type, value, -cluster) %>% 
  ggplot() + 
  geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
  facet_wrap(~cluster) +
  theme_classic() +
  theme(legend.position = "none")So how would we describe these clusters?
We can get the exact stats from our scaled data
attr(arrest.scale,"scaled:center")  Murder  Assault UrbanPop     Rape 
   7.788  170.760   65.540   21.232 attr(arrest.scale,"scaled:scale")   Murder   Assault  UrbanPop      Rape 
 4.355510 83.337661 14.474763  9.366385 Lets visualize these in ordination space.
We can use the factoextra::fviz_cluster to illustrate how these places fall out
fviz_cluster(km.arrest, data = arrest.scale,
             ellipse.type = "convex",
             palette = "jco",
             ggtheme = theme_minimal())factoextra::fviz_cluster while quick can be clunky and difficult to get exactly how you would like your plot displayed. Here is how you can break the data apart and plot yourself.
trythis<-stats::prcomp(arrest.scale, scale = FALSE, center = FALSE)
state_scores<-as.data.frame(scores(trythis))
state_scores$cluster <- km.arrest$cluster
state_scores$state <- rownames(state_scores)
head(state_scores)Next we need to find which points fall on the outside of each group (i.e., hull). We do this using the chull() command.
chull(state_scores %>% filter(cluster ==1) %>% select(PC1, PC2) )[1] 13  5  7  3 12 11  9grp.1 <- state_scores[state_scores$cluster == 1, ][chull(state_scores %>% filter(cluster ==1) %>% select(PC1, PC2) ), ]  # hull values for cluster 1
grp.2 <- state_scores[state_scores$cluster == 2, ][chull(state_scores %>% filter(cluster ==2) %>% select(PC1, PC2) ), ]  # hull values for cluster 2
grp.3 <- state_scores[state_scores$cluster == 3, ][chull(state_scores %>% filter(cluster ==3) %>% select(PC1, PC2) ), ]  # hull values for cluster 3
all_hulls <- rbind(grp.1,grp.2,grp.3)
head(all_hulls)ggplot(data = state_scores) + 
  geom_point(aes(x = PC1, y = PC2, color = as.factor(cluster))) +
  geom_text(aes(x = PC1, y = PC2, color = as.factor(cluster), label = state))  +
  geom_polygon(data = all_hulls, aes(x = PC1, y = PC2, fill = as.factor(cluster), colour =  as.factor(cluster)), alpha = 0.25) + 
  theme_minimal() Above we chose to seperate the data into three clusters. But how do we decide what is the appropriate number of clusters?
There are four different ways this can be done:
Cross validation. A subset of the data is used to develop the model and then it is ‘verfied’ with the rest of the data by checking the sum of squared distances to the group centroids. An average of the sum of square distances is then taken. Best number of clusters should have the lowest average squared distance.
Elbow method. Similar to a scree plot where you choose the “elbow” in the plot representing a decrease in the rate of change in variance
wss <- (nrow(arrest.scale)-1)*sum(apply(arrest.scale,2,var))
nclusters = 15
for(i in 2: nclusters){
  wss[i]<-sum(kmeans(arrest.scale, centers = i, nstart = 25)$withinss)
  
}
scree_data<-data.frame(wss = wss, clusters = 1:nclusters)
head(scree_data)And plot out the scree plot
ggplot(data = scree_data) + 
  geom_line(aes(x = clusters, y = wss)) +
  geom_point(aes(x = clusters, y = wss), size = 3) +
  scale_x_continuous(breaks = 1:nclusters) +
  theme_classic()The elbow in this plot is at 4, but you can see that this can be a little tricky in finding the “elbow”
fviz_nbclust(arrest.scale, kmeans,
             method = "silhouette")There is also a type of clustering, that clusters around a mediod rather than a centroid. In using, PAM clustering each cluster is represented by one of the objects in the cluster. PAM is less sensitive to outliers compared to k-means
pamkout<-fpc::pamk(arrest.scale, krange = 2:15)
pamkout$pamobject
Medoids:
           ID     Murder    Assault   UrbanPop       Rape
New Mexico 31  0.8292944  1.3708088  0.3081225  1.1603196
Nebraska   27 -0.8008247 -0.8250772 -0.2445636 -0.5052109
Clustering vector:
       Alabama         Alaska        Arizona       Arkansas     California 
             1              1              1              2              1 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             1              2              2              1              1 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             2              2              1              2              2 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             2              2              1              2              1 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             2              1              2              1              1 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             2              2              1              2              2 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             1              1              1              2              2 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             2              2              2              2              1 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             2              1              1              2              2 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             2              2              2              2              2 
Objective function:
   build     swap 
1.441358 1.368969 
Available components:
 [1] "medoids"    "id.med"     "clustering" "objective"  "isolation"  "clusinfo"  
 [7] "silinfo"    "diss"       "call"       "data"      
$nc
[1] 2
$crit
 [1] 0.0000000 0.4084890 0.3143656 0.3389904 0.3105170 0.2629987 0.2243815 0.2386072
 [9] 0.2466113 0.2447023 0.2525199 0.2548601 0.2514600 0.2504588 0.2400069medoid_data <- data.frame(pamkout$pamobject$medoids)
medoid_data$state<-rownames(medoid_data)
medoid_data %>% 
  gather(type, value, -state) %>% 
  ggplot() + 
  geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
  facet_wrap(~state) +
  theme_classic() +
  theme(legend.position = "none")# WPM("refresh-cache")
# WPM("list-packages", "available")
# WPM("install-package", "XMeans")
xout<-XMeans(arrest.scale)
xout
XMeans
======
Requested iterations            : 1
Iterations performed            : 1
Splits prepared                 : 2
Splits performed                : 0
Cutoff factor                   : 0.5
Percentage of splits accepted 
by cutoff factor                : 0 %
------
Cutoff factor                   : 0.5
------
Cluster centers                 : 2 centers
Cluster 0
            1.004934034580132 1.0138273518643042 0.19758526759992892 0.8469650134151087
Cluster 1
            -0.6699560230534215 -0.675884901242869 -0.1317235117332867 -0.564643342276739
Distortion: 16.966612
BIC-Value : -20.965379xout$class_ids       Alabama         Alaska        Arizona       Arkansas     California 
             0              0              0              1              0 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             0              1              1              0              0 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             1              1              0              1              1 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             1              1              0              1              0 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             1              0              1              0              0 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             1              1              0              1              1 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             0              0              0              1              1 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             1              1              1              1              0 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             1              0              0              1              1 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             1              1              1              1              1 xcenters<-data.frame(cluster = 1:2,matrix(c(1.004934034580132, 1.0138273518643042,0.19758526759992892,0.8469650134151087,-0.6699560230534215, -0.675884901242869,-0.1317235117332867,-0.564643342276739), byrow = TRUE, ncol = 4))
names(xcenters)[2:5]<- colnames(arrest.scale)
xcenters %>% 
  gather(type, value, -cluster) %>% 
  ggplot() + 
  geom_bar(aes(x = type, y = value, fill = type), position = "dodge", stat = "identity", colour = "black") +
  facet_wrap(~cluster) +
  theme_classic() +
  theme(legend.position = "none")There is another type of partitioning clustering, called clara and is used primarily for large datasets. We will not be discussing in this class, but there is a function cluster::clara() (among some others) to be used when clustering “big data”.
Two main types of hierarchical clustering
USArrests %>% 
  scale() %>% 
  dist(method = "euclidian") -> arrest.euc
arrest_clust<-hclust(arrest.euc, method = "ward.D2")
arrest_clust
Call:
hclust(d = arrest.euc, method = "ward.D2")
Cluster method   : ward.D2 
Distance         : euclidean 
Number of objects: 50 We have several options to plot our dendrograms.
plot(arrest_clust)fviz_dend(arrest_clust, 
          k = 2, # two groups
          cex = 0.5) # label sizeggdendrogram(arrest_clust,rotate=TRUE, size = 1)Or extract the data and build your own plot
arrest_dendro  <- as.dendrogram(arrest_clust)
dend_dendro_data <- dendro_data(arrest_dendro, type = "rectangle")
names(dend_dendro_data)[1] "segments"    "labels"      "leaf_labels" "class"      head( dend_dendro_data$segments)ggplot(data = dend_dendro_data$segments)+
  geom_segment(aes(x = x, y = y, xend = xend, yend=yend), size =1, colour = "red") + 
  theme_void()head( dend_dendro_data$labels)ggplot(data = dend_dendro_data$segments)+
  geom_segment(aes(x = x, y = y, xend = xend, yend=yend), size =1, colour = "red") + 
  geom_text(data = dend_dendro_data$labels, aes(x = x, y = y, label = label), colour = "blue", hjust = 1, angle = 90, size =3) +
  coord_cartesian(ylim=c(-3,15)) +
  theme_void()Choosing the appropriate number of clusters
NbClust(arrest.scale,
        distance = "euclidean",
        min.nc = 2, max.nc = 10,
        method = "complete", index ="all") -> arrest.nb*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 *** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 9 proposed 2 as the best number of clusters 
* 4 proposed 3 as the best number of clusters 
* 6 proposed 4 as the best number of clusters 
* 2 proposed 5 as the best number of clusters 
* 1 proposed 8 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 
                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* fviz_nbclust(arrest.nb, ggtheme = theme_minimal())Among all indices: 
===================
* 2 proposed  0 as the best number of clusters
* 1 proposed  1 as the best number of clusters
* 9 proposed  2 as the best number of clusters
* 4 proposed  3 as the best number of clusters
* 6 proposed  4 as the best number of clusters
* 2 proposed  5 as the best number of clusters
* 1 proposed  8 as the best number of clusters
* 1 proposed  10 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is  2 .res.diana <- diana(USArrests, stand = TRUE)
res.dianaMerge:
      [,1] [,2]
 [1,]  -15  -29
 [2,]  -13  -32
 [3,]  -23  -49
 [4,]  -14  -36
 [5,]  -20  -31
 [6,]  -16  -38
 [7,]  -37  -47
 [8,]    1  -19
 [9,]    4  -35
[10,]  -41  -48
[11,]  -46  -50
[12,]  -12  -27
[13,]  -21  -30
[14,]  -24  -40
[15,]    2  -43
[16,]  -17  -26
[17,]   -1  -42
[18,]  -10  -18
[19,]    9    6
[20,]   -4   11
[21,]  -11  -44
[22,]   -7  -39
[23,]    8  -34
[24,]   10  -45
[25,]    5  -22
[26,]   17   18
[27,]   14  -33
[28,]   12    3
[29,]   -5  -28
[30,]   -3   25
[31,]   29   -6
[32,]   15  -25
[33,]   30   32
[34,]   22   13
[35,]   23   24
[36,]   19    7
[37,]   20   -8
[38,]   34   21
[39,]   28   16
[40,]   37   36
[41,]   26   27
[42,]   39   35
[43,]   33   -9
[44,]   43   31
[45,]   40   38
[46,]   -2   44
[47,]   45   42
[48,]   41   46
[49,]   48   47
Order of objects:
 [1] Alabama        Tennessee      Georgia        Louisiana      Mississippi   
 [6] South Carolina North Carolina Alaska         Arizona        Maryland      
[11] New Mexico     Michigan       Illinois       New York       Texas         
[16] Missouri       Florida        California     Nevada         Colorado      
[21] Arkansas       Virginia       Wyoming        Delaware       Indiana       
[26] Oklahoma       Ohio           Kansas         Pennsylvania   Oregon        
[31] Washington     Connecticut    Rhode Island   Massachusetts  New Jersey    
[36] Hawaii         Utah           Idaho          Nebraska       Minnesota     
[41] Wisconsin      Kentucky       Montana        Iowa           New Hampshire 
[46] Maine          North Dakota   South Dakota   West Virginia  Vermont       
Height:
 [1] 1.0340801 1.3687929 1.0408349 2.8205587 0.9771986 1.3960934 5.4623592 4.0837483
 [9] 1.4720236 0.6743186 1.3385319 1.9655812 0.4336501 1.0044328 1.8355779 2.9848751
[17] 3.0534495 1.4621581 1.7044355 7.4792599 1.2084105 0.8788445 2.1852059 2.5927716
[25] 0.6281577 0.8500187 1.0553617 0.6765523 2.1808837 0.7311305 3.7810250 1.2903008
[33] 1.9665793 0.9626545 2.5032675 1.2698293 5.3463933 0.9132211 1.4571402 0.6215418
[41] 2.5227753 1.0181020 2.9549994 0.2619577 0.7930730 1.2918266 2.1082699 0.8721679
[49] 1.3129132
Divisive coefficient:
[1] 0.8530481
Available components:
[1] "order"     "height"    "dc"        "merge"     "diss"      "call"      "order.lab"
[8] "data"     fviz_dend(res.diana, cex = 0.5,
          k = 4, # Cut in four groups
          palette = "jco" # Color palette
          )