Question

我尝试使用不同的软件包来计算 Silhouette 指数（轮廓系数）。但是，各个软件包给出的结果之间存在一些差异（尤其是 clusterCrit 包的结果）。我想知道计算 Silhouette 指数最可靠的方法是什么，以及 clusterCrit 是如何计算它的（因为我无法从其源代码中找到具体的实现）。

# Compare four implementations of the average silhouette width on the same
# k-means partitions of random data, for k = 2..20.
#
# NOTE: Indice.S() is defined further down in this post and must be sourced
# before this script is run.
set.seed(23123)
data <- matrix(rnorm(10000), ncol = 100)

k_values <- 2:20
k_max <- max(k_values)

# Pre-fill with NA (not logical FALSE): slot 1 is never computed, and a FALSE
# placeholder would be coerced to 0 and plotted as if it were a real result.
sil0 <- rep(NA_real_, k_max)
sil1 <- rep(NA_real_, k_max)
sil2 <- rep(NA_real_, k_max)
sil3 <- rep(NA_real_, k_max)

dis <- dist(data, method = "euclidean")

for (k in k_values) {
  clust <- kmeans(data, k, nstart = 20)

  # based on cluster package: mean over the third column of the result
  silhou <- cluster::silhouette(clust$cluster, dis)
  sil0[k] <- mean(silhou[, 3])

  # based on fpc package
  statos <- fpc::cluster.stats(dis, clust$cluster)
  sil1[k] <- statos$avg.silwidth

  # based on NbClust package (copied the part of the NbClust()
  # function that calculates this index, end of the post)
  sil2[k] <- Indice.S(dis, clust$cluster)

  # based on clusterCrit
  # NOTE(review): clusterCrit appears to define its Silhouette index as the
  # mean of the per-cluster mean silhouette widths rather than the mean over
  # all observations, which would explain a different curve -- confirm
  # against the clusterCrit vignette.
  criteri <- clusterCrit::intCriteria(data, clust$cluster, c("Silhouette"))
  sil3[k] <- criteri$silhouette
}

# Derive the y-range from all four series so no curve is clipped by the
# limits that plot() would otherwise take from sil0 alone.
y_range <- range(sil0, sil1, sil2, sil3, na.rm = TRUE)

plot(sil0, ylim = y_range, xlab = "k", ylab = "average silhouette width")
lines(sil1)
lines(sil2, col = "blue")
lines(sil3, col = "red")

以下代码块提取自 NbClust() 函数的源代码，并在上面的脚本中被调用：

# Average silhouette width of a partition (logic taken from NbClust()).
#
# Arguments:
#   d   dissimilarities between observations; anything as.matrix() turns into
#       a square matrix (e.g. a "dist" object). The within-cluster average
#       assumes the diagonal is zero, as it is for "dist" objects.
#   cl  vector of cluster labels, one per observation, in the row order of d.
#
# Value:
#   Sum of the per-observation silhouette widths divided by length(cl).
#   Every cluster with a single member contributes 1 to that sum.
#
# Only labels that actually occur in `cl` are treated as clusters. Looping
# over 1:max(cl) instead would turn every unused label number into an empty
# cluster, which adds a spurious 1 to the sum and a 0/0 = NaN to the
# nearest-cluster minimum.
Indice.S <- function(d, cl) {
  d <- as.matrix(d)
  labels <- sort(unique(cl))
  total <- 0

  for (k in labels) {
    members <- which(cl == k)
    size_k <- length(members)

    if (size_k <= 1) {
      # Singleton cluster: scored as 1 by this implementation.
      cluster_sum <- 1
    } else {
      other_labels <- labels[labels != k]
      cluster_sum <- 0

      for (i in members) {
        # Mean distance to the other members of the own cluster; the row sum
        # includes d[i, i], hence the division by (size - 1).
        a_i <- sum(d[i, members]) / (size_k - 1)

        # Smallest mean distance from i to any other cluster.
        mean_to_others <- vapply(
          other_labels,
          function(j) mean(d[i, cl == j]),
          numeric(1)
        )
        b_i <- min(mean_to_others)

        cluster_sum <- cluster_sum + (b_i - a_i) / max(a_i, b_i)
      }
    }

    total <- total + cluster_sum
  }

  total / length(cl)
}

Silhouette指数的最可靠估算？

0 个答案: