我尝试用不同的软件包计算轮廓系数(Silhouette index),但各个结果之间存在一些差异(尤其是 clusterCrit 包的结果)。我想知道计算轮廓系数最可靠的方法是什么,以及 clusterCrit 是如何计算它的(因为我无法从它的源代码中找到答案)。
# Compare the average silhouette width reported by four implementations
# (cluster, fpc, NbClust's Indice.S and clusterCrit) for k-means partitions
# with k = 2..20 on the same random data set.
# Indice.S() must already be defined in the session before this script runs.
set.seed(23123)
data <- matrix(rnorm(10000), ncol = 100)

# k = 1 is never evaluated, so the result vectors start as numeric NA rather
# than logical FALSE; otherwise slot 1 would be plotted as a spurious 0.
sil0 <- rep(NA_real_, 20)
sil1 <- rep(NA_real_, 20)
sil2 <- rep(NA_real_, 20)
sil3 <- rep(NA_real_, 20)

dis <- dist(data, method = "euclidean")

for (k in 2:20) {
  clust <- kmeans(data, k, nstart = 20)

  # based on cluster package
  silhou <- cluster::silhouette(clust$cluster, dis)
  sil0[k] <- mean(silhou[, 3])

  # based on fpc package
  statos <- fpc::cluster.stats(dis, clust$cluster)
  sil1[k] <- statos$avg.silwidth

  # based on NbClust package (Indice.S is the part of the NbClust()
  # function that calculates this index)
  sil2[k] <- Indice.S(dis, clust$cluster)

  # based on clusterCrit
  # NOTE(review): clusterCrit documents its "Silhouette" criterion as the
  # mean of the per-cluster mean silhouette widths rather than the mean over
  # all observations, which would explain a systematic difference from the
  # three values above -- confirm against the clusterCrit vignette.
  criteri <- clusterCrit::intCriteria(data, clust$cluster, c("Silhouette"))
  sil3[k] <- criteri$silhouette
}

# Use a y-range that covers all four series; plot() alone would derive the
# limits from sil0 only and could clip the other curves.
plot(
  sil0,
  ylim = range(sil0, sil1, sil2, sil3, finite = TRUE),
  xlab = "k",
  ylab = "average silhouette width"
)
lines(sil1)
lines(sil2, col = "blue")
lines(sil3, col = "red")
以下代码块提取自 NbClust 函数的源代码,并在上面的脚本中使用:
# Average silhouette width of a partition (NbClust variant).
#
# Arguments:
#   d   Pairwise dissimilarities between the n observations: a `dist` object
#       or anything as.matrix() turns into an n x n matrix.
#   cl  Vector of n cluster labels, one per observation, without NA.
#
# Returns:
#   A single number: the mean over all observations of
#   s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean
#   dissimilarity of i to the other members of its own cluster and b(i) is
#   the smallest mean dissimilarity of i to the members of any other cluster.
#
# Conventions:
#   * Observations in singleton clusters score 1. This is the convention of
#     the original NbClust code and is kept for compatibility.
#   * Only labels that actually occur in `cl` are treated as clusters, so
#     non-contiguous labels (e.g. 1 and 3) no longer yield NaN.
#   * If a(i) and b(i) are both 0 (coincident points) s(i) is taken as 0
#     instead of the 0/0 = NaN it would otherwise produce.
#   * With a single cluster of more than one observation b(i) is undefined
#     and the result is NaN.
Indice.S <- function(d, cl) {
  d <- as.matrix(d)
  cl <- as.vector(cl)
  n <- length(cl)

  if (n == 0L || anyNA(cl)) {
    stop("`cl` must be a non-empty vector of cluster labels without NA",
         call. = FALSE)
  }
  # A size mismatch would otherwise be hidden by recycling of logical indices.
  if (nrow(d) != n || ncol(d) != n) {
    stop("`d` must be an n x n dissimilarity matrix with n = length(cl)",
         call. = FALSE)
  }

  labels <- unique(cl)
  # n x K indicator matrix: member[i, k] is 1 when observation i has label k.
  member <- outer(cl, labels, "==") + 0
  sizes <- colSums(member)
  own <- match(cl, labels)
  own_cell <- cbind(seq_len(n), own)
  own_size <- sizes[own]

  # dist_sums[i, k] = sum of dissimilarities from observation i to cluster k.
  dist_sums <- d %*% member

  # a(i): d[i, i] = 0 is part of the sum, hence the division by size - 1.
  # Singletons give 0/0 here; their score is overwritten below.
  a <- dist_sums[own_cell] / (own_size - 1)

  # b(i): smallest mean dissimilarity to a cluster other than i's own.
  cluster_mean <- sweep(dist_sums, 2, sizes, "/")
  cluster_mean[own_cell] <- Inf
  b <- apply(cluster_mean, 1, min)

  denom <- pmax(a, b)
  s <- (b - a) / denom
  # which() drops the NA comparisons produced by singleton clusters.
  s[which(denom == 0)] <- 0
  s[own_size == 1] <- 1

  sum(s) / n
}