如何迭代运行Kmeans / Random Forest并比较准确性

时间:2016-12-09 18:23:06

标签: r cluster-analysis k-means random-forest

原始数据:

head(df.num_kmeans)
  datausage     mou   revenue calldrop handset2g handset3g smartphone
1  896804.7 2854801 40830.404    27515      7930     19040      20810
2  155932.1  419109  5512.498     5247      2325      2856       3257
3  674983.3 2021183 25252.265    21068      6497     13056      14273
4  522787.2 1303221 14547.380     8865      4693      9439      10746
5  523465.7 1714641 24177.095    25441      8668     12605      14766
6  527062.3 1651303 20153.482    18219      6822     11067      12994
  rechargecount rechargesum      arpu subscribers 
1          4461      235430 197704.10      105822             
2           843       39820  34799.21       18210             
3          2944      157099 133842.38       71351             
4          2278      121697 104681.58       44975              
5          2802      144262 133190.55       75860             
6          2875      143333 119389.91       63740  

我有以下PCA数据,我正在进行Kmeans聚类:

head(pcdffinal)
        PC1         PC2       PC3       PC4         PC5        PC6
 1 -9.204228 -2.73517110 2.7975063 0.6794614 -0.84627095  0.4455297
 2  2.927245  0.05666389 0.5085896 0.1472800  0.18193152  0.1041490
 3 -4.667932 -1.98176361 2.2751862 0.5347725 -0.43314927  0.3222719
 4 -1.366505 -0.40858595 0.5005192 0.4507366 -0.54996933  0.5533013
 5 -4.689454 -2.77185636 2.4323856 0.7387788  0.49237229 -0.4817083
 6 -3.477046 -1.84904214 1.5539558 0.5463861 -0.03231143  0.2814843

# Number of clusters requested from k-means.
opt.cluster <- 3
# Seed the RNG so the random starts below are reproducible.
set.seed(115)
# Cluster the rows of pcdffinal, trying 25 random initialisations.
pccomp.km <- kmeans(x = pcdffinal, centers = opt.cluster, nstart = 25)

# Show the cluster assignment of the first few rows (console output below).
head(pccomp.km$cluster)
[1] 2 1 2 2 2 2
# Bar chart of how many rows were assigned to each cluster.
barplot(table(pccomp.km$cluster), col="steelblue")

enter image description here

pccomp.km$tot.withinss # total within-cluster sum of squares of the fitted model
[1] 13172.59

我们还可以用图表展示各数据点被分配到了哪个聚类。

# Use a single plotting panel.
par(mfrow = c(1, 1))

# Scatter the first two columns of pcdffinal, coloured by cluster id
# (shifted by one so cluster 1 does not use colour 1).
plot(
  pcdffinal[, 1:2],
  col = pccomp.km$cluster + 1,
  main = paste('K-Means    Clustering result with k = ', opt.cluster, sep = " "),
  pch = 20,
  cex = 2
)

# Overlay the cluster centres as filled squares in the default colour.
points(pccomp.km$centers, pch = 15, cex = 2)

enter image description here

library("factoextra")
# Draw the k-means partition of pcdffinal with a convex hull around each
# cluster. `ellipse.type` is used because factoextra deprecated the older
# `frame.type` argument in its favour.
fviz_cluster(pccomp.km, data = pcdffinal, ellipse.type = "convex") +
  theme_minimal()

enter image description here

# Work on a copy of df.num so the original data frame is left untouched.
df.num_kmeans <- df.num
# Add the cluster assignment vector returned by kmeans() as a new column.
df.num_kmeans[["cluster.kmeans"]] <- pccomp.km$cluster

保存此数据集和 kmeans 模型,供后续使用

# Persist the fitted k-means model so it can be reloaded later with readRDS().
saveRDS(pccomp.km, file = "kmeans_model.RDS")
# Write the data together with its cluster labels. row.names = FALSE keeps the
# row index out of the file; otherwise read.csv() brings it back as an extra
# column `X`, which a `cluster.kmeans ~ .` model would treat as a predictor.
write.csv(df.num_kmeans, file = "dfnum_kmeans.cluster.csv", row.names = FALSE)

library(cluster)
# df.num_kmeans also carries the cluster.kmeans label column; pass only the
# feature columns so the label does not feed into the 2-D projection that is
# used to display the clusters.
feature_cols <- setdiff(names(df.num_kmeans), "cluster.kmeans")
clusplot(
  df.num_kmeans[, feature_cols, drop = FALSE],
  pccomp.km$cluster,
  # Keep the title the original call produced from its first argument.
  main = "CLUSPLOT( df.num_kmeans )",
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

enter image description here

library(ggfortify)
# Plot the k-means fit over pcdffinal with a "norm" frame drawn per cluster.
autoplot(
  pccomp.km,
  data = pcdffinal,
  frame = TRUE,
  frame.type = "norm"
)

enter image description here

然后我运行随机森林模型并计算准确度:

# Reload the data set (with cluster labels) that was written to disk earlier.
dfnum.kmeans <- read.csv(file = "dfnum_kmeans.cluster.csv")
# Number of rows in each cluster.
table(dfnum.kmeans[["cluster.kmeans"]])

将聚类变量(cluster.kmeans)转换为因子

# Treat the cluster label as a categorical outcome rather than a number.
dfnum.kmeans[["cluster.kmeans"]] <- as.factor(dfnum.kmeans[["cluster.kmeans"]])
# Confirm the conversion took effect.
is.factor(dfnum.kmeans[["cluster.kmeans"]])

使用'caret'包

创建训练和测试集(75:25拆分)
set.seed(128) # for reproducibility
# Split on the cluster label: 75% of the rows go to training, the rest to test.
inTrain_kmeans <- caret::createDataPartition(
  y = dfnum.kmeans$cluster.kmeans,
  p = 0.75,
  list = FALSE
)

training_kmeans <- dfnum.kmeans[inTrain_kmeans, ]
testing_kmeans <- dfnum.kmeans[-inTrain_kmeans, ]

set.seed(122)
# 10-fold cross-validation for tuning. caret functions are namespace-qualified
# throughout so the script does not depend on caret being attached.
# NOTE(review): method is "repeatedcv" but no `repeats` is given, so caret's
# default applies; confirm whether more than one repeat was intended.
control <- caret::trainControl(
  method = "repeatedcv",
  number = 10,
  allowParallel = TRUE
)

# Random forest predicting the cluster label from every other column.
# NOTE(review): `number = 25` is not a documented argument of train(); it is
# forwarded through `...` to the underlying model. It is kept as in the
# original; confirm whether e.g. `ntree` was meant.
modFit.rfcaret_kmeans <- caret::train(
  cluster.kmeans ~ .,
  method = "rf",
  data = training_kmeans,
  trControl = control,
  number = 25
)
modFit.rfcaret_kmeans$finalModel

# Predict the held-out rows and compare with the true labels. The confusion
# matrix is computed once and reused for both printouts.
pred.test_kmeans <- predict(modFit.rfcaret_kmeans, testing_kmeans)
cm.test_kmeans <- caret::confusionMatrix(
  pred.test_kmeans,
  testing_kmeans$cluster.kmeans
)
cm.test_kmeans
# First element of `overall` (the accuracy asked for in the question).
cm.test_kmeans$overall[1]

我想对一系列 k 值(例如 k = 2:6)分别运行 Kmeans 和随机森林:对每个 k 绘制相应的图,并分别保存模型和数据(csv);然后对于随机森林,导入上面保存的 csv,创建训练集和测试集,运行随机森林模型,对测试数据进行预测,最后计算混淆矩阵和准确度……从而以迭代方式得到结果。

需要帮助把上述代码改写成循环,计数器 k 从 2 到 6。

0 个答案:

没有答案