我正在使用一个名为productQuality1.1的CSV数据集,其中包含5列,其中Median是我用来确定聚类结果的产品质量绩效。我已经发现最好的k聚类数是2。如何获得数据的聚类结果?我在下面粘贴了我的数据的内容:
structure(list(weld.type.ID = 1:33, weld.type = structure(c(29L,
11L, 16L, 4L, 28L, 17L, 19L, 5L, 24L, 27L, 21L, 32L, 12L, 20L,
26L, 25L, 3L, 7L, 13L, 22L, 33L, 1L, 9L, 10L, 18L, 15L, 31L,
8L, 23L, 2L, 14L, 6L, 30L), .Label = c("1,40,Material A", "1,40S,Material C",
"1,80,Material A", "1,STD,Material A", "1,XS,Material A", "10,10S,Material C",
"10,160,Material A", "10,40,Material A", "10,40S,Material C",
"10,80,Material A", "10,STD,Material A", "10,XS,Material A",
"13,40,Material A", "13,40S,Material C", "13,80,Material A",
"13,STD,Material A", "13,XS,Material A", "14,40,Material A",
"14,STD,Material A", "14,XS,Material A", "15,STD,Material A",
"15,XS,Material A", "2,10S,Material C", "2,160,Material A", "2,40,Material A",
"2,40S,Material C", "2,80,Material A", "2,STD,Material A", "2,XS,Material A",
"4,80,Material A", "4,STD,Material A", "6,STD,Material A", "6,XS,Material A"
), class = "factor"), alpha = c(281L, 196L, 59L, 96L, 442L, 98L,
66L, 30L, 68L, 43L, 35L, 44L, 23L, 14L, 24L, 38L, 8L, 8L, 5L,
19L, 37L, 38L, 6L, 11L, 29L, 6L, 16L, 6L, 16L, 3L, 4L, 9L, 12L
), beta = c(7194L, 4298L, 3457L, 2982L, 4280L, 3605L, 2229L,
1744L, 2234L, 1012L, 1096L, 1023L, 1461L, 1303L, 531L, 233L,
630L, 502L, 328L, 509L, 629L, 554L, 358L, 501L, 422L, 566L, 403L,
211L, 159L, 268L, 167L, 140L, 621L), Median = c(0.0375507383753025,
0.043546015959685, 0.0166888869351212, 0.0310875876067419, 0.0935470294716035,
0.0263798143584636, 0.0286213698125569, 0.0167296957822645, 0.029403369311426,
0.0404683392593359, 0.0306699148693358, 0.0409507113292405, 0.0152814823151512,
0.0103834693100336, 0.0426953962552843, 0.139335880048896, 0.0120333156133183,
0.0150573864235556, 0.0140547965388361, 0.0354001989345449, 0.0551110033888123,
0.0636987097619679, 0.0156058684578843, 0.0208640835981798, 0.0636580207464108,
0.00992440459162821, 0.0374531528739036, 0.0262100640799903,
0.0898729525910631, 0.00989157442426205, 0.0215577154517479,
0.0584418091169483, 0.0184528408043719)), class = "data.frame", row.names = c(NA,
-33L))
答案 0 :(得分:2)
我想您或多或少地知道有两个聚类,并且您想查看聚类是否可以很好地分隔Median变量。
首先我们看一下您的数据框:
# Per-column summary of the data frame: min/quartiles/mean/max for the numeric
# columns and per-level counts for the factor column weld.type.
summary(productQuality1.1)
weld.type.ID weld.type alpha beta
Min. : 1 1,40,Material A : 1 Min. : 3.00 Min. : 140
1st Qu.: 9 1,40S,Material C : 1 1st Qu.: 9.00 1st Qu.: 403
Median :17 1,80,Material A : 1 Median : 24.00 Median : 621
Mean :17 1,STD,Material A : 1 Mean : 54.24 Mean :1383
3rd Qu.:25 1,XS,Material A : 1 3rd Qu.: 44.00 3rd Qu.:1744
Max. :33 10,10S,Material C: 1 Max. :442.00 Max. :7194
(Other) :27
Median
Min. :0.009892
1st Qu.:0.016689
Median :0.029403
Mean :0.036686
3rd Qu.:0.042695
Max. :0.139336
聚类时只能使用alpha和beta这两列,因为weld.type.ID和weld.type在每一行中都是唯一的(相当于标识符),不包含可用于分组的信息。我们这样做:
# Cluster on the two numeric measurements only; weld.type.ID and weld.type are
# unique per row and carry no grouping information.
# NOTE: alpha and beta are used unscaled here, so beta (whose range is far
# larger) dominates the Euclidean distance used by k-means.
# k-means starts from randomly chosen centres: fix the seed so the assignment
# is reproducible, and use several random starts to avoid a poor local optimum.
set.seed(1)
clus <- kmeans(
  productQuality1.1[, c("alpha", "beta")],
  centers = 2,
  nstart = 25
)
# Store the assignment as a factor so it is treated as a discrete group
# (cluster numbers themselves are arbitrary labels).
productQuality1.1$cluster <- factor(clus$cluster)
请注意,这里直接使用了alpha和beta的原始值,而这两列的刻度(取值范围)相差很大,因此取值范围更大的beta会主导距离的计算。我们可以将聚类结果可视化:
# ggplot2 must be attached before ggplot()/aes()/geom_point() can be called.
library(ggplot2)

# Scatter plot of alpha against beta, coloured by the k-means cluster label.
ggplot(productQuality1.1, aes(x = alpha, y = beta, colour = cluster)) +
  geom_point()
仅用kmeans把这些观测值分成2个聚类并不容易,因为其中少数观测的alpha/beta值非常高。我们还可以查看Median值在同一散点图上的分布情况:
# Same alpha-vs-beta scatter plot, but coloured by the continuous Median value
# using the viridis colour scale.
ggplot(
  productQuality1.1,
  aes(x = alpha, y = beta, colour = Median)
) +
  geom_point() +
  scale_color_viridis_c()
最后,我们看一下每个聚类中Median值的分布:
# Density of Median, drawn as one curve per cluster so the two groups can be
# compared directly.
ggplot(
  productQuality1.1,
  aes(x = Median, colour = cluster)
) +
  geom_density()
我认为,第2个聚类中有一部分观测的Median值更高,但其余的观测则很难与另一个聚类区分开(注意:kmeans的聚类编号是任意的,您运行时两个聚类的编号可能互换)。根据散点图中看到的情况,您可能需要进一步考虑如何利用现有的alpha和beta值。