R:尽管nstart和iter.max的设置不同,k-means仍得到相同的聚类

时间:2017-03-10 18:59:52

标签: r cluster-analysis k-means

为什么在 kmeans() 中对 iter.max 和 nstart 使用(非常)不同的设置,却得到了完全相同的聚类结果?

set.seed(1)
ff_1 <- kmeans(faithful, 2, iter.max = 1, nstart = 1)
set.seed(1)
ff_2 <- kmeans(faithful, 2, iter.max = 2, nstart = 1)
set.seed(1)
ff_300 <- kmeans(faithful, 2, iter.max = 300, nstart = 300)
identical(ff_1, ff_2) # TRUE
identical(ff_1, ff_300) # TRUE

我的真正目标是通过比较第1次迭代后的聚类与第2、3或10次迭代后的聚类,来可视化k均值聚类的收敛过程(用于教学目的)。这就是我加入 set.seed(1) 这一行的原因。

1 个答案:

答案 0 :(得分:2)

kmeans的初始质心是随机选择的。你在各种设置下得到相同的结果,原因有两点:

(1)你在所有情况下都设置了相同的随机种子(seed = 1),这会强制每次都选出完全相同的初始质心;

(2)这两个簇完全可分,收敛非常快(第一次迭代之后就已收敛),因此增加迭代次数不会改变结果。

下图显示了它。

library(grid)
library(gridExtra)
library(ggplot2)

# Scatter plot of the `faithful` data coloured by k-means cluster, with the
# fitted centroids overlaid as large black asterisks.
#
# fit          - object returned by kmeans() when run on `faithful`
# title        - plot title
# legend_title - title shown above the colour legend
#
# Returns a ggplot object.
plot_kmeans <- function(fit, title, legend_title) {
  clustered <- data.frame(faithful, cluster = factor(fit$cluster))
  centroids <- as.data.frame(fit$centers)

  ggplot(clustered, aes(eruptions, waiting, colour = cluster)) +
    geom_point() +
    # The centroid data has no `cluster` column, so do not inherit the
    # colour mapping from the main layer.
    geom_point(
      data = centroids, aes(eruptions, waiting), inherit.aes = FALSE,
      colour = "black", shape = "*", size = 15
    ) +
    labs(title = title, colour = legend_title)
}

# Reset the RNG to the same state before every fit, so only iter.max and
# nstart differ between the three runs.
set.seed(1)
ff_1 <- kmeans(faithful, 2, iter.max = 1, nstart = 1)

set.seed(1)
ff_2 <- kmeans(faithful, 2, iter.max = 2, nstart = 1)

set.seed(1)
ff_300 <- kmeans(faithful, 2, iter.max = 300, nstart = 300)

# One panel per fit, stacked in a single figure.
grid.arrange(
  plot_kmeans(ff_1, "kmeans seed 1\n", "ff1 cluster\n"),
  plot_kmeans(ff_2, "kmeans seed 1\n", "ff2 cluster\n"),
  plot_kmeans(ff_300, "kmeans seed 1\n", "ff300 cluster\n")
)

identical(ff_1, ff_2) # TRUE
identical(ff_1, ff_300) # TRUE

enter image description here

现在,让我们改变种子,迫使kmeans选择不同的初始质心,结果会有所不同,如下图所示。

# Scatter plot of the `faithful` data coloured by k-means cluster, with the
# fitted centroids overlaid as large black asterisks.
#
# fit          - object returned by kmeans() when run on `faithful`
# title        - plot title
# legend_title - title shown above the colour legend
#
# Returns a ggplot object.
plot_kmeans <- function(fit, title, legend_title) {
  clustered <- data.frame(faithful, cluster = factor(fit$cluster))
  centroids <- as.data.frame(fit$centers)

  ggplot(clustered, aes(eruptions, waiting, colour = cluster)) +
    geom_point() +
    # The centroid data has no `cluster` column, so do not inherit the
    # colour mapping from the main layer.
    geom_point(
      data = centroids, aes(eruptions, waiting), inherit.aes = FALSE,
      colour = "black", shape = "*", size = 15
    ) +
    labs(title = title, colour = legend_title)
}

# A different seed before each fit, so the runs start from different RNG
# states in addition to using different iter.max / nstart settings.
set.seed(1)
ff_1 <- kmeans(faithful, 2, iter.max = 1, nstart = 1)

set.seed(12)
ff_2 <- kmeans(faithful, 2, iter.max = 2, nstart = 1)

set.seed(123)
ff_300 <- kmeans(faithful, 2, iter.max = 300, nstart = 300)

# One panel per fit, stacked in a single figure; each title names its seed.
grid.arrange(
  plot_kmeans(ff_1, "kmeans seed 1\n", "ff1 cluster\n"),
  plot_kmeans(ff_2, "kmeans seed 12\n", "ff2 cluster\n"),
  plot_kmeans(ff_300, "kmeans seed 123\n", "ff300 cluster\n")
)

identical(ff_1, ff_2) # FALSE
identical(ff_1, ff_300) # FALSE

enter image description here