在嵌套结构内进行汇总(summarise)- R

时间:2018-06-24 15:30:54

标签: r dplyr purrr

我得到了一个data.frame,这是应用NbClust的结果。

# dput() output of the input tibble (3 rows, 2 columns):
#   group - character labels "group 1" .. "group 3"
#   cl_no - list-column holding one 2 x 26 numeric matrix per group; its rows
#           are "Number_clusters" and "Value_Index", its columns are the 26
#           index names ("KL", "CH", ..., "SDbw").
# NOTE(review): the literal is not assigned here; the later snippets refer to
# it as `df`, so presumably it is meant to be stored under that name.
structure(list(group = c("group 1", "group 2", "group 3"), cl_no = list(
structure(c(6, 5.5585, 3, 1948.797, 4, 242.1568, 3, 54.1407, 
6, 1840.5631, 6, 9.79002146656865e+22, 4, 90450081623.4205, 
4, 156864.9512, 4, 10.6283, 4, -0.8923, 4, 0.302, 7, 1.333, 
3, 0.2803, 3, 0.8199, 3, 247.2649, 3, 0.5296, 6, 0.2888, 
4, 229137.149, 3, 0.5548, 2, NA, 3, 1.0779, 6, 0.0198, 0, 
0, 3, 0.0878, 0, 0, 7, 0.3994), .Dim = c(2L, 26L), .Dimnames = list(
    c("Number_clusters", "Value_Index"), c("KL", "CH", "Hartigan", 
    "CCC", "Scott", "Marriot", "TrCovW", "TraceW", "Friedman", 
    "Rubin", "Cindex", "DB", "Silhouette", "Duda", "PseudoT2", 
    "Beale", "Ratkowsky", "Ball", "PtBiserial", "Frey", "McClain", 
    "Dunn", "Hubert", "SDindex", "Dindex", "SDbw"))), structure(c(5, 
2.0578, 3, 1802.9906, 6, 123.5253, 5, 57.783, 4, 1566.4832, 
6, 4.34800291791868e+22, 4, 48666985724.7935, 4, 76377.8659, 
4, 14.0144, 6, -0.8768, 7, 0.276, 3, 1.2218, 3, 0.3522, 3, 
1.7337, 3, -525.5972, 3, -1.0202, 3, 0.3427, 4, 208314.4788, 
3, 0.6146, 2, NA, 3, 0.9593, 6, 0.0272, 0, 0, 5, 0.0762, 
0, 0, 4, 0.4535), .Dim = c(2L, 26L), .Dimnames = list(c("Number_clusters", 
"Value_Index"), c("KL", "CH", "Hartigan", "CCC", "Scott", 
"Marriot", "TrCovW", "TraceW", "Friedman", "Rubin", "Cindex", 
"DB", "Silhouette", "Duda", "PseudoT2", "Beale", "Ratkowsky", 
"Ball", "PtBiserial", "Frey", "McClain", "Dunn", "Hubert", 
"SDindex", "Dindex", "SDbw"))), structure(c(5, 6.9238, 3, 
1916.4988, 4, 266.3208, 4, 55.0186, 4, 1588.8823, 6, 7.0269945394771e+22, 
4, 106432779026.732, 4, 171716.4478, 4, 8.8217, 4, -0.9068, 
6, 0.2616, 6, 1.3002, 3, 0.2756, 3, 1.5454, 3, -532.1761, 
3, -0.8509, 6, 0.2911, 4, 243096.5673, 3, 0.5354, 2, NA, 
3, 1.0995, 4, 0.0138, 0, 0, 3, 0.087, 0, 0, 7, 0.5195), .Dim = c(2L, 
26L), .Dimnames = list(c("Number_clusters", "Value_Index"
), c("KL", "CH", "Hartigan", "CCC", "Scott", "Marriot", "TrCovW", 
"TraceW", "Friedman", "Rubin", "Cindex", "DB", "Silhouette", 
"Duda", "PseudoT2", "Beale", "Ratkowsky", "Ball", "PtBiserial", 
"Frey", "McClain", "Dunn", "Hubert", "SDindex", "Dindex", 
"SDbw"))))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-3L), .Names = c("group", "cl_no"))

我想得到最佳聚类数,即各个指标给出的聚类数中出现次数最多的那个。为此我尝试了下面的代码,但它对每个聚类数返回的计数都是 3,所以 n() 很可能统计的是 df 的行数。

# Asker's attempt (kept verbatim): for each matrix in the cl_no list-column,
# transpose it, convert it to a data frame, group by Number_clusters and count
# the rows per group with n(). The asker reports that every count comes back
# as 3.
df %>% mutate(cl_no = map(cl_no, ~t(.x) %>% as.data.frame() %>% 
group_by(Number_clusters) %>% summarise(n = n())))

我哪里做错了?我期望的输出如下:

# dput() output of the desired result: a 3-row tibble with the group label and
# a single numeric cl_no per group (3, 3, 4).
structure(list(group = c("group 1", "group 2", "group 3"), cl_no = c(3, 
3, 4)), .Names = c("group", "cl_no"), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -3L))

1 个答案:

答案 0 :(得分:1)

我们可以使用table

library(tidyverse)
library(magrittr)
df %>%
   # For each group's matrix, tabulate the first row (Number_clusters) and
   # keep the most frequent cluster count as a plain number.
   mutate(cl_no = map_dbl(cl_no, ~ .x %>%
                                      extract(1, ) %>%
                                      table %>%
                                      # which.max() yields the position of the
                                      # largest count inside the table, not the
                                      # cluster count itself; the cluster count
                                      # is that entry's name, so read it back
                                      # from the name. On ties the first
                                      # (smallest) cluster count wins.
                                      which.max %>%
                                      names %>%
                                      as.numeric))
# A tibble: 3 x 2
#  group   cl_no
#  <chr>   <dbl>
#1 group 1     3
#2 group 2     3
#3 group 3     4

或者,按名称取出出现次数最多的聚类数(这样结果不依赖于它在 table 中的位置):

df %>%
   mutate(cl_no = map_dbl(cl_no, function(index_matrix) {
      # Frequency table of the first row (Number_clusters) of this matrix.
      votes <- table(index_matrix[1, ])
      # Keep the label(s) whose frequency equals the maximum and convert the
      # label text back to a number.
      as.numeric(names(votes)[votes == max(votes)])
   }))

或者先按 group 进行 split,然后在 list 的每个元素内进行 summarise

# Split df by group and, within each list element, find the Number_clusters
# value that occurs most often across the indices. The result is a numeric
# vector named by group label.
best_k <- df %>%
     split(.$group) %>%
     map_dbl(~ .x$cl_no[[1]] %>%
                       t %>%
                       as_tibble %>%
                       count(Number_clusters) %>%
                       slice(which.max(n)) %>%
                       pull(Number_clusters))

# split() returns its pieces ordered by sorted group label, which need not be
# the row order of df. Look each row's value up by group name instead of by
# position, and drop the names so cl_no is a plain numeric column.
df %>%
     transmute(group, cl_no = unname(best_k[as.character(group)]))