R:对具有可变长度的嵌套列表应用功能

时间:2018-07-11 14:27:22

标签: r lapply nested-lists

我有以下数据框:

df <- data.frame(id = paste0('id', sample(c(1:4),80000, replace = TRUE)), date = as.Date(rbeta(80000, 0.7, 10) * 100, origin = "2016-01-01"), 
             variant = sample(c(0:1), 80000, replace = TRUE), type = sample(paste0(LETTERS[1:3],LETTERS[1]), 80000, TRUE), code = sample(letters[1:2], 80000, TRUE), 
             level = sample(LETTERS[1:8], 80000, TRUE), number = sample(c(1:100), 80000, replace = TRUE) )

接下来,我将数据帧拆分几次,并将它们(加上原始df)合并到一个列表中:

dfs <- split(df,df$id)
df2 <- lapply(dfs, function(x) split(x,x$type))

df3 <- lapply(dfs, function(x) split(x,x$code))

df4 <- lapply(dfs, function(x) split(x,x$level))

df_all <- list(dfs,df2,df3,df4)

因此,我首先按ID拆分数据帧,然后在几种条件下对它们进行拆分:无,类型,代码和级别。 “无”表示我不会再拆分它。

我的第一个问题:有没有更快/更清洁的方法来实现这一目标?

第二个问题:如何将函数应用于此列表的每个元素?它可能与lapply有关,但是随着嵌套列表的数量变化,我不知道怎么做。因此,为了更清楚一点,我想知道如何将我的函数应用于:

df_all[[1]]$id1
df_all[[1]]$id2
df_all[[1]]$id3
df_all[[1]]$id4
df_all[[2]]$id1$AA
df_all[[2]]$id1$BA
df_all[[2]]$id1$CA
df_all[[2]]$id2$AA
etc.

我的功能如下:

func <- function(x){
x <- x %>%
  group_by(variant) %>%
  summarise(H = sum(number)) %>%
  ungroup()

1 个答案:

答案 0 :(得分:1)

如果您要做的只是对变量的不同组合进行分组并进行汇总,那么拆分组可能不是一个好主意,只需修改函数,以便您可以按如下所示输入变量的不同组合: / p>

library(dplyr)

func2 <- function(x, ...){
  group_quo = quos(...)
  x %>%
    group_by(!!!group_quo) %>%
    summarize(H = sum(number)) 
}

结果:

> func2(df, id, variant)
# A tibble: 8 x 3
# Groups:   id [?]
  id    variant      H
  <fct>   <int>  <int>
1 id1         0 500192
2 id1         1 508282
3 id2         0 505829
4 id2         1 511855
5 id3         0 502280
6 id3         1 510854
7 id4         0 502621
8 id4         1 510372

> func2(df, id, type, variant)
# A tibble: 24 x 4
# Groups:   id, type [?]
   id    type  variant      H
   <fct> <fct>   <int>  <int>
 1 id1   AA          0 167757
 2 id1   AA          1 169025
 3 id1   BA          0 166225
 4 id1   BA          1 168208
 5 id1   CA          0 166210
 6 id1   CA          1 171049
 7 id2   AA          0 169277
 8 id2   AA          1 172240
 9 id2   BA          0 168596
10 id2   BA          1 169396
# ... with 14 more rows

etc.

如果您尝试应用更复杂的内容,或者想要保留列表的层次结构,则可以尝试使用嵌套的data.frames:

library(dplyr)
library(tidyr)
library(purrr)

func <- function(x){
  x %>%
    group_by(variant) %>%
    summarize(H = sum(number)) 
}

df_nested = df %>% 
  group_by(id) %>%
  nest() %>%
  mutate(df1 = data %>% map(func),
         df2 = data %>% map(~group_by(., type) %>% nest()),
         df3 = data %>% map(~group_by(., code) %>% nest()),
         df4 = data %>% map(~group_by(., level) %>% nest())) %>%
  mutate_at(vars(df2:df4), 
            funs(map(., function(x) mutate(x, data = map(data, func)) %>% unnest))) 

结果:

> df_nested
# A tibble: 4 x 6
  id    data                  df1              df2              df3              df4              
  <fct> <list>                <list>           <list>           <list>           <list>           
1 id1   <tibble [19,963 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
2 id3   <tibble [19,946 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
3 id2   <tibble [20,114 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
4 id4   <tibble [19,977 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>

> df_nested %>% 
+   select(id, data) %>%
+   unnest()
# A tibble: 80,000 x 7
   id    date       variant type  code  level number
   <fct> <date>       <int> <fct> <fct> <fct>  <int>
 1 id1   2016-01-05       1 AA    b     H         71
 2 id1   2016-01-01       0 CA    a     G         85
 3 id1   2016-01-03       0 CA    a     E         98
 4 id1   2016-01-01       1 BA    b     E         78
 5 id1   2016-01-01       1 BA    b     G         64
 6 id1   2016-01-18       1 AA    a     E         69
 7 id1   2016-01-04       1 BA    b     E         12
 8 id1   2016-01-02       0 CA    b     B         32
 9 id1   2016-01-01       1 CA    a     B         44
10 id1   2016-01-02       0 BA    a     F         89
# ... with 79,990 more rows

> df_nested %>% 
+   select(id, df1) %>%
+   unnest()
# A tibble: 8 x 3
  id    variant      H
  <fct>   <int>  <int>
1 id1         0 500192
2 id1         1 508282
3 id3         0 502280
4 id3         1 510854
5 id2         0 505829
6 id2         1 511855
7 id4         0 502621
8 id4         1 510372