我有一个 data.frame,它是应用 NbClust 得到的结果:
# Input data as dput() output: a tibble with three rows and two columns.
#   group - character label ("group 1" .. "group 3")
#   cl_no - list-column; each element is a 2 x 26 numeric matrix whose rows are
#           "Number_clusters" and "Value_Index" and whose columns are the index
#           names ("KL", "CH", ..., "SDbw").
# NOTE(review): the snippets below refer to this object as `df`, but the
# literal is not assigned to any name here.
structure(list(group = c("group 1", "group 2", "group 3"), cl_no = list(
structure(c(6, 5.5585, 3, 1948.797, 4, 242.1568, 3, 54.1407,
6, 1840.5631, 6, 9.79002146656865e+22, 4, 90450081623.4205,
4, 156864.9512, 4, 10.6283, 4, -0.8923, 4, 0.302, 7, 1.333,
3, 0.2803, 3, 0.8199, 3, 247.2649, 3, 0.5296, 6, 0.2888,
4, 229137.149, 3, 0.5548, 2, NA, 3, 1.0779, 6, 0.0198, 0,
0, 3, 0.0878, 0, 0, 7, 0.3994), .Dim = c(2L, 26L), .Dimnames = list(
c("Number_clusters", "Value_Index"), c("KL", "CH", "Hartigan",
"CCC", "Scott", "Marriot", "TrCovW", "TraceW", "Friedman",
"Rubin", "Cindex", "DB", "Silhouette", "Duda", "PseudoT2",
"Beale", "Ratkowsky", "Ball", "PtBiserial", "Frey", "McClain",
"Dunn", "Hubert", "SDindex", "Dindex", "SDbw"))), structure(c(5,
2.0578, 3, 1802.9906, 6, 123.5253, 5, 57.783, 4, 1566.4832,
6, 4.34800291791868e+22, 4, 48666985724.7935, 4, 76377.8659,
4, 14.0144, 6, -0.8768, 7, 0.276, 3, 1.2218, 3, 0.3522, 3,
1.7337, 3, -525.5972, 3, -1.0202, 3, 0.3427, 4, 208314.4788,
3, 0.6146, 2, NA, 3, 0.9593, 6, 0.0272, 0, 0, 5, 0.0762,
0, 0, 4, 0.4535), .Dim = c(2L, 26L), .Dimnames = list(c("Number_clusters",
"Value_Index"), c("KL", "CH", "Hartigan", "CCC", "Scott",
"Marriot", "TrCovW", "TraceW", "Friedman", "Rubin", "Cindex",
"DB", "Silhouette", "Duda", "PseudoT2", "Beale", "Ratkowsky",
"Ball", "PtBiserial", "Frey", "McClain", "Dunn", "Hubert",
"SDindex", "Dindex", "SDbw"))), structure(c(5, 6.9238, 3,
1916.4988, 4, 266.3208, 4, 55.0186, 4, 1588.8823, 6, 7.0269945394771e+22,
4, 106432779026.732, 4, 171716.4478, 4, 8.8217, 4, -0.9068,
6, 0.2616, 6, 1.3002, 3, 0.2756, 3, 1.5454, 3, -532.1761,
3, -0.8509, 6, 0.2911, 4, 243096.5673, 3, 0.5354, 2, NA,
3, 1.0995, 4, 0.0138, 0, 0, 3, 0.087, 0, 0, 7, 0.5195), .Dim = c(2L,
26L), .Dimnames = list(c("Number_clusters", "Value_Index"
), c("KL", "CH", "Hartigan", "CCC", "Scott", "Marriot", "TrCovW",
"TraceW", "Friedman", "Rubin", "Cindex", "DB", "Silhouette",
"Duda", "PseudoT2", "Beale", "Ratkowsky", "Ball", "PtBiserial",
"Frey", "McClain", "Dunn", "Hubert", "SDindex", "Dindex",
"SDbw"))))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L), .Names = c("group", "cl_no"))
我想得到最佳聚类数,即各个指标给出的聚类数中出现次数最多的那个。为此我尝试了下面的代码,但它对每个聚类数返回的计数都是 3,看来 n() 统计的可能是 df 的行数。
# Attempt from the question: for each matrix in the cl_no list-column,
# transpose it, convert it to a data frame, and count the rows per
# Number_clusters value, all inside mutate(). The question reports that this
# yields n = 3 for every cluster count.
df %>% mutate(cl_no = map(cl_no, ~t(.x) %>% as.data.frame() %>%
group_by(Number_clusters) %>% summarise(n = n())))
我哪里做错了?我期望的输出如下:
# Expected result as dput() output: one row per group, with cl_no reduced to a
# single number per group (3, 3, 4).
structure(list(group = c("group 1", "group 2", "group 3"), cl_no = c(3,
3, 4)), .Names = c("group", "cl_no"), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -3L))
答案 0 :(得分:1)
我们可以使用 table 统计每个聚类数出现的次数:
library(tidyverse)
library(magrittr)
# For each group, return the cluster count proposed by the largest number of
# indices. Row 1 of every cl_no matrix holds the proposed cluster counts.
df %>%
  mutate(cl_no = map_dbl(cl_no, ~ .x %>%
    extract(1, ) %>%
    table() %>%
    # which.max() yields the POSITION of the most frequent entry within the
    # table, not the cluster count itself; the count is stored in the entry's
    # name, so take the name and convert it back to a number.
    which.max() %>%
    names() %>%
    as.numeric()))
# A tibble: 3 x 2
# group cl_no
# <chr> <dbl>
#1 group 1 3
#2 group 2 3
#3 group 3 4
或者,如果聚类数的取值与其在 table 中的位置不一致,则按名称取出实际的聚类数:
# For each group, return the cluster count proposed by the largest number of
# indices, read from the names of the frequency table.
df %>%
  mutate(cl_no = map_dbl(cl_no, ~ .x %>%
    extract(1, ) %>%
    table() %>%
    # Keep every entry whose frequency equals the maximum (may be several).
    {which(. == max(.))} %>%
    names() %>%
    as.numeric() %>%
    # map_dbl() requires exactly one value per element; on a tie keep the
    # smallest cluster count instead of failing.
    min()))
或者按 “group” 进行 split,然后在 list 的各个元素内进行 summarise:
# For each group, count how many indices propose each cluster count and keep
# the most frequent one. The counting is done outside mutate()/transmute().
best_by_group <- df %>%
  split(.$group) %>%
  map_dbl(~ .x$cl_no[[1]] %>%
    t() %>%
    as_tibble() %>%
    group_by(Number_clusters) %>%
    summarise(n = n()) %>%
    slice(which.max(n)) %>%
    pull(Number_clusters))

# split() orders its pieces by the sorted group labels, which need not match
# the row order of df. Look each result up by group name rather than relying
# on position; as.character() guards against indexing by factor codes.
df %>%
  transmute(group, cl_no = unname(best_by_group[as.character(group)]))