修改

Question

我是 R 初学者，正在尝试用平均轮廓（average silhouette）方法选择聚类数。运行下面的 for 循环时，R 会崩溃并提示错误：“R for Windows GUI front-end has stopped working”（R for Windows GUI 前端已停止工作）。我使用的代码如下：

# Load the data from an interactively chosen CSV file; the first column
# supplies the row names.
data_no_demographics<-read.csv(file.choose(),row.names=1)
# Drop every row that contains a missing value.
data_no_demographics <- na.omit(data_no_demographics)
library(cluster)
# Largest number of clusters to try.
k_max<-20
# One slot per k, initialised to 0; slot 1 is never filled by the loop below,
# so it stays 0 in the plot.
sil<-rep(0,k_max)
for(i in 2:k_max){
  # k-means with i centres on the column-standardised data, 50 random starts.
  km.res <- kmeans(scale(data_no_demographics), centers = i, nstart = 50)
  # NOTE(review): cluster::silhouette() documents its second argument as a
  # dissimilarity ("dist") object, but a scaled data matrix is passed here --
  # confirm whether dist(scale(...)) was intended.
  ss <- silhouette(km.res$cluster, scale(data_no_demographics))
  # Mean of column 3 of the silhouette result (presumably the per-observation
  # silhouette width -- verify against the cluster package documentation).
  sil[i] <- mean(ss[, 3])
}
# Plot the stored average for every k and mark the k with the largest value.
plot(1:k_max, sil, type = "b", pch = 20,col="black",frame = FALSE, xlab ="Number of clusters (k)", ylab="Average silhouette width (fit of data within its cluster)", main="Optimal number of clusters")
abline(v = which.max(sil), lty = 2, col="red")

任何帮助将不胜感激。

# Print an R-parseable text representation of the first four rows.
dput(data_no_demographics[1:4,])

输出：structure(list(DRY.FOOD.MEAL.PREPARATION = c(0, 0, 0, 0), BASMATI.RICE = c(0.106274747, 0.086781763, 0.066892377, 0.039564525), JASMINE.RICE = c(0.037947215, 0.101672855, 0.025094901, 0.026354222), ITALIAN.RICE = c(0, 0.006335413, 0, 0), ORG.SIDE.DISHES = c(0, 0, 0, 0), COUSCOUS = c(0.014091056, 0.013493856, 0.009541397, 0.006704727), QUINOA = c(0.013921964, 0.020977683, 0.011593311, 0.006638343), FLAVOURED.RICE = c(0.322163823, 0.225682349, 0.378639581, 0.340580191), INSTANT.RICE = c(0.063184297, 0.044711557, 0.092459218, 0.152615507), PARBOILED.RICE = c(0.065396593, 0.089652796, 0.048548271, 0.032295539), RTH.SPECIALTY = c(0.010371018, 0.011771236, 0.009418283, 0.007833245), RTH.RICE = c(0.307790945, 0.286375991, 0.29621422, 0.336331652), OTHER.RICE = c(0.013020136, 0.019484745, 0.010382682, 0.006737918), REGULAR.DRY.RICE = c(0.045838206, 0.093059756, 0.051215759, 0.044344132), READY.TO.EAT = c(0, 0, 0, 0)), .Names = c("DRY.FOOD.MEAL.PREPARATION", "BASMATI.RICE", "JASMINE.RICE", "ITALIAN.RICE", "ORG.SIDE.DISHES", "COUSCOUS", "QUINOA", "FLAVOURED.RICE", "INSTANT.RICE", "PARBOILED.RICE", "RTH.SPECIALTY", "RTH.RICE", "OTHER.RICE", "REGULAR.DRY.RICE", "READY.TO.EAT"), row.names = c(1000L, 1004L, 1007L, 1008L), class = "data.frame")

Answer 1

我刚用您提供的数据做了测试，发现调用 silhouette 时会产生一些警告。将 silhouette(km.res$cluster, scale(data_no_demographics)) 改为 silhouette(km.res$cluster, dist(data_no_demographics))（即传入距离矩阵而不是数据矩阵）即可消除这些警告。

# Four-row sample of the data (rows 1000, 1004, 1007 and 1008), rebuilt by
# hand so the example runs without the original CSV file.
data_no_demographics <- data.frame(
  DRY.FOOD.MEAL.PREPARATION = c(0, 0, 0, 0),
  BASMATI.RICE = c(0.106274747, 0.086781763, 0.066892377, 0.039564525),
  JASMINE.RICE = c(0.037947215, 0.101672855, 0.025094901, 0.026354222),
  ITALIAN.RICE = c(0, 0.006335413, 0, 0),
  ORG.SIDE.DISHES = c(0, 0, 0, 0),
  COUSCOUS = c(0.014091056, 0.013493856, 0.009541397, 0.006704727),
  QUINOA = c(0.013921964, 0.020977683, 0.011593311, 0.006638343),
  FLAVOURED.RICE = c(0.322163823, 0.225682349, 0.378639581, 0.340580191),
  INSTANT.RICE = c(0.063184297, 0.044711557, 0.092459218, 0.152615507),
  PARBOILED.RICE = c(0.065396593, 0.089652796, 0.048548271, 0.032295539),
  RTH.SPECIALTY = c(0.010371018, 0.011771236, 0.009418283, 0.007833245),
  RTH.RICE = c(0.307790945, 0.286375991, 0.29621422, 0.336331652),
  OTHER.RICE = c(0.013020136, 0.019484745, 0.010382682, 0.006737918),
  REGULAR.DRY.RICE = c(0.045838206, 0.093059756, 0.051215759, 0.044344132),
  READY.TO.EAT = c(0, 0, 0, 0),
  row.names = c(1000L, 1004L, 1007L, 1008L)
)

# Omit columns that are just zeros. scale() divides each column by its
# standard deviation, so an all-zero column would turn into NaN.
# The columns are detected rather than dropped by position, so this keeps
# working when the column layout changes. isTRUE() makes a column containing
# NA count as "not all zero" instead of producing an NA index.
is_all_zero <- vapply(
  data_no_demographics,
  function(column) isTRUE(all(column == 0)),
  logical(1)
)
data_no_demographics <- data_no_demographics[!is_all_zero]

library(cluster)

# Largest number of clusters to evaluate.
k_max <- 3

# Standardise the columns once: the result does not change between
# iterations, so there is no reason to recompute it inside the loop.
scaled_data <- scale(data_no_demographics)

# Average silhouette width needs at least 2 clusters and fewer clusters than
# observations.
stopifnot(k_max >= 2, k_max <= nrow(scaled_data) - 1)

# Average silhouette width for each k. The silhouette is not defined for a
# single cluster, so k = 1 stays NA instead of a made-up 0 that which.max()
# could pick when every real average is negative.
sil <- rep(NA_real_, k_max)

# silhouette() expects a dissimilarity object. It is computed on the same
# standardised data that kmeans() clusters, so cluster quality is measured in
# the space the clusters were actually formed in.
distance_matrix <- dist(scaled_data)

for (i in 2:k_max) {
  km.res <- kmeans(scaled_data, centers = i, nstart = 1)
  ss <- silhouette(km.res$cluster, distance_matrix)
  sil[i] <- mean(ss[, "sil_width"])
}

# Average silhouette width against k, with the best k marked in red.
plot(
  seq_len(k_max), sil,
  type = "b", pch = 20, col = "black", frame.plot = FALSE,
  xlab = "Number of clusters (k)",
  ylab = "Average silhouette width (fit of data within its cluster)",
  main = "Optimal number of clusters"
)
abline(v = which.max(sil), lty = 2, col = "red")

修改

我只是把距离矩阵（dist）的计算移到了循环之外。当数据行数较多时，这样应该能加快计算。

运行for循环时R崩溃

1 个答案:

修改