如何迭代地为不同的簇(k)值进行聚类

时间:2016-12-08 10:29:36

标签: r cluster-analysis

我有以下PCA数据,我正在进行Kmeans聚类:

head(pcdffinal)
        PC1         PC2       PC3       PC4         PC5        PC6
 1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095  0.4455297
 2  2.927245  0.05666389 0.5085896 0.1472800  0.18193152  0.1041490
 3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927  0.3222719
 4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933  0.5533013
 5 -4.689454 -2.77185636 2.4323856 0.7387788  0.49237229 -0.4817083
 6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143  0.2814843

# Number of clusters requested from k-means.
opt.cluster <- 3
# Fix the RNG state so the random starts are reproducible.
set.seed(115)
# Cluster the PCA scores, keeping the best of 25 random starts.
pccomp.km <- kmeans(x = pcdffinal, centers = opt.cluster, nstart = 25)

head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
# Bar chart of how many observations were assigned to each cluster.
barplot(
  table(pccomp.km$cluster),
  col = "steelblue"
)

enter image description here

pccomp.km$tot.withinss #For total within cluster sum of squares.
[1] 13172.59

我们还可以使用图表来说明数据已被安排到的组。

# Single-panel layout.
par(mfrow = c(1, 1))
# Scatter plot of the first two principal components, coloured by cluster
# (the +1 shifts the colour index so cluster 1 is not drawn in black).
plot(
  pcdffinal[, 1:2],
  col = pccomp.km$cluster + 1,
  main = paste('K-Means    Clustering result with k = ', opt.cluster, sep = " "),
  pch = 20,
  cex = 2
)
# Overlay the cluster centres as filled squares in the default colour.
points(pccomp.km$centers, pch = 15, cex = 2)

enter image description here

library("factoextra")
# Cluster plot of the k-means result on the PCA scores, with the "convex"
# frame type and a minimal theme.
fviz_cluster(
  pccomp.km,
  data = pcdffinal,
  frame.type = "convex"
) +
  theme_minimal()

enter image description here

# Work on a copy so the original data frame is left untouched.
df.num_kmeans <- df.num
# Attach the cluster assignment vector returned by kmeans() as a new column.
df.num_kmeans[["cluster.kmeans"]] <- pccomp.km$cluster

保存此数据集和 kmeans 模型,以供后续使用

# Persist the fitted k-means model for later reuse.
saveRDS(pccomp.km, file = "kmeans_model.RDS")
# Export the data set together with its cluster column.
write.csv(df.num_kmeans, file = "dfnum_kmeans.cluster.csv")

library(cluster)
# Cluster plot of the labelled data, coloured and shaded per cluster.
clusplot(
  df.num_kmeans,
  pccomp.km$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

enter image description here

library(ggfortify)
# ggplot-based view of the k-means result with a 'norm' frame per cluster.
autoplot(
  pccomp.km,
  data = pcdffinal,
  frame = TRUE,
  frame.type = 'norm'
)

enter image description here

我想对 K = 2:6 中的每个 k 迭代地运行 Kmeans,每次为相应的 k 绘制图表,并保存模型、将数据保存为 csv,每个 k 各保存一份。

需要帮助将上述代码转换为迭代,计数器从2到6。

原始数据:

head(df.num_kmeans)
  datausage     mou   revenue calldrop handset2g handset3g smartphone
1  896804.7 2854801 40830.404    27515      7930     19040      20810
2  155932.1  419109  5512.498     5247      2325      2856       3257
3  674983.3 2021183 25252.265    21068      6497     13056      14273
4  522787.2 1303221 14547.380     8865      4693      9439      10746
5  523465.7 1714641 24177.095    25441      8668     12605      14766
6  527062.3 1651303 20153.482    18219      6822     11067      12994
  rechargecount rechargesum      arpu subscribers 
1          4461      235430 197704.10      105822             
2           843       39820  34799.21       18210             
3          2944      157099 133842.38       71351             
4          2278      121697 104681.58       44975              
5          2802      144262 133190.55       75860             
6          2875      143333 119389.91       63740     

使用随机森林进行准确性比较

# write.csv() stored the row names as an unnamed first column; read that
# column back as row names so it does not reappear as a spurious `X` column
# that the `cluster.kmeans ~ .` formula would later use as a predictor.
dfnum.kmeans <- read.csv("dfnum_kmeans.cluster.csv", row.names = 1)
table(dfnum.kmeans$cluster.kmeans) # size of each cluster

将cluster var转换为因子

# Turn the cluster id into a factor so it is treated as a class label.
dfnum.kmeans[["cluster.kmeans"]] <- as.factor(dfnum.kmeans[["cluster.kmeans"]])
# Sanity check: prints TRUE when the conversion worked.
is.factor(dfnum.kmeans[["cluster.kmeans"]])

使用 'caret' 包创建训练集和测试集(75:25 分割)

# Random-forest check: split the labelled data 75:25, train with caret and
# report how well the cluster labels are predicted on the held-out rows.
# Every caret function is namespace-qualified so the snippet does not depend
# on library(caret) having been attached beforehand.
set.seed(128) # for reproducibility
inTrain_kmeans <- caret::createDataPartition(
  y = dfnum.kmeans$cluster.kmeans, p = 0.75, list = FALSE
)

training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]

set.seed(122)
# NOTE(review): "repeatedcv" is requested without `repeats`; confirm the
# intended number of repeats.
control <- caret::trainControl(
  method = "repeatedcv", number = 10, allowParallel = TRUE
)
# NOTE(review): `number = 25` is passed to train() itself, not to the
# resampling control; presumably it is forwarded to the underlying model via
# `...` -- confirm what was intended here.
modFit.rfcaret_kmeans <- caret::train(
  cluster.kmeans ~ .,
  method = "rf", data = training_kmeans,
  trControl = control, number = 25
)
modFit.rfcaret_kmeans$finalModel
pred.test_kmeans <- predict(modFit.rfcaret_kmeans, testing_kmeans)
# Build the confusion matrix once and reuse it for both outputs.
cm_kmeans <- caret::confusionMatrix(
  pred.test_kmeans, testing_kmeans$cluster.kmeans
)
cm_kmeans
cm_kmeans$overall[1]

1 个答案:

答案 0 :(得分:0)

假设您的原始数据框是 df.num,下面的代码会针对不同的 k 值,把所有文件保存到您的工作目录中:

# Open a PNG device on `path`, run `draw()`, and always close the device again
# (even when drawing fails) so a failed plot cannot leave a device open.
save_png <- function(path, draw) {
  png(path)
  on.exit(dev.off(), add = TRUE)
  draw()
  invisible(path)
}

# Fit k-means on `pcdffinal` for k = 2..6. For every k, write four PNG plots,
# the fitted model (.RDS) and the labelled copy of `df.num` (.csv); all file
# names are relative paths that carry k. Uses `pcdffinal` and `df.num`, and
# calls fviz_cluster(), theme_minimal(), clusplot() and autoplot(), so the
# packages providing them must already be attached.
for (k in 2:6) {
  set.seed(115) # same seed for every k so each fit is reproducible
  pccomp.km <- kmeans(pcdffinal, k, nstart = 25)
  # Values are not auto-printed inside a loop, so print() is required.
  print(head(pccomp.km$cluster))
  print(paste(k, pccomp.km$tot.withinss)) # total within-cluster sum of squares

  # First two columns of the data coloured by cluster, centres as squares.
  save_png(paste0('kmeans_proj_', k, '.png'), function() {
    par(mfrow = c(1, 1))
    plot(
      pcdffinal[, 1:2],
      col = (pccomp.km$cluster + 1),
      main = paste('K-Means    Clustering result with k = ', k, sep = " "),
      pch = 20, cex = 2
    )
    points(pccomp.km$centers, pch = 15, cex = 2)
  })

  # The plot object must be print()ed explicitly to be drawn inside a function.
  save_png(paste0('kmeans_fviz_', k, '.png'), function() {
    print(
      fviz_cluster(pccomp.km, data = pcdffinal, frame.type = "convex") +
        theme_minimal()
    )
  })

  # Copy of the original data with the cluster assignment added as a column.
  df.num_kmeans <- df.num
  df.num_kmeans$cluster.kmeans <- pccomp.km$cluster
  saveRDS(pccomp.km, paste0("kmeans_model_", k, ".RDS"))
  write.csv(df.num_kmeans, paste0("dfnum_kmeans_", k, ".cluster.csv"))

  # NOTE(review): `df.num_kmeans` already contains the cluster column, so the
  # label itself is part of the data handed to clusplot() -- confirm intended.
  save_png(paste0('clusplot_', k, '.png'), function() {
    clusplot(
      df.num_kmeans, pccomp.km$cluster,
      color = TRUE, shade = TRUE, labels = 2, lines = 0
    )
  })

  save_png(paste0('autoplot_', k, '.png'), function() {
    print(autoplot(pccomp.km, data = pcdffinal, frame = TRUE, frame.type = 'norm'))
  })
}