Question

我有一个词频数据框和一些其他随机人口统计变量。我想利用两个分组变量，删除不需要的变量，然后根据分组变量对频率求和。

下面是一个与我的数据类似的示例：

# Example data: one row per user, two grouping columns, and three
# term-frequency columns.
df <- data.frame(
  user = 1:9,
  Group1 = rep(c("a", "b", "c"), each = 3),
  Group2 = c("d", "e", "d", "e", "d", "e", "e", "e", "e"),
  term1 = c(0, 1, 1, 0, 1, 1, 0, 0, 0),
  term2 = c(1, 0, 1, 1, 0, 1, 0, 1, 1),
  term3 = c(0, 1, 0, 0, 0, 0, 1, 1, 0)
)

这就是我想要的。

# Expected result: one row per Group1 x Group2 pairing, holding the summed
# term counts for that pairing.
desired <- data.frame(
  Group1 = rep(c("a", "b", "c"), each = 2),
  Group2 = rep(c("d", "e"), times = 3),
  term1 = c(1, 1, 1, 1, 0, 0),
  term2 = c(2, 0, 0, 2, 0, 2),
  term3 = c(0, 1, 0, 0, 0, 2)
)

我的真实数据框有大约 4000 个词项列，因此在 dplyr 函数中逐一写出每个列名似乎不可行。

谢谢！

Answer 1

你可以试试aggregate + expand.grid + merge

# Build every Group1 x Group2 pairing, then outer-join the per-group sums
# onto it; pairings with no rows in df come back as NA.
merge(
  x = expand.grid(Group1 = unique(df$Group1), Group2 = unique(df$Group2)),
  y = aggregate(. ~ Group1 + Group2, data = df[-1], FUN = sum),
  all = TRUE
)

给出

  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d    NA    NA    NA
6      c      e     0     2     2

如果您想将 NA 设为 0，可以尝试

# Keep the merged table in `res` so that its NA cells can be overwritten.
> res <- merge(
  with(df, expand.grid(Group1 = unique(Group1), Group2 = unique(Group2))),
  aggregate(. ~ Group1 + Group2, df[-1], sum),
  all = TRUE
)

# is.na(res) flags every NA cell; those cells are shown as 0 in the result.
> replace(res, is.na(res), 0)
  Group1 Group2 term1 term2 term3
1      a      d     1     2     0
2      a      e     1     0     1
3      b      d     1     0     0
4      b      e     1     2     0
5      c      d     0     0     0
6      c      e     0     2     2

Answer 2

我们可以按 'Group1'、'Group2' 进行分组，在 summarise 中对 'term' 列求 sum，并用 complete 对缺失的组合进行扩展

library(dplyr)
library(tidyr)
# Sum every term column within each Group1/Group2 pair, add the pairs that
# never occur in df, and set their (NA) sums to 0. Columns are matched by
# the "term" prefix, so no term column has to be named individually.
df %>%
  group_by(Group1, Group2) %>%
  summarise(across(starts_with("term"), sum), .groups = "drop") %>%
  complete(Group1, Group2) %>%
  mutate(across(starts_with("term"), ~ replace_na(.x, 0)))

-输出

# A tibble: 6 x 5
  Group1 Group2 term1 term2 term3
  <chr>  <chr>  <dbl> <dbl> <dbl>
1 a      d          1     2     0
2 a      e          1     0     1
3 b      d          1     0     0
4 b      e          1     2     0
5 c      d          0     0     0
6 c      e          0     2     2

Answer 3

如果您不需要补全所有分组组合，setDT(df)[,lapply(.SD[,-1], sum),.(Group1,Group2)] 就足够了。否则，您可以使用 tidyr 包中的 complete（如 Answer 2 中使用的）来填充缺失的组合。

library(data.table)
library(tidyr)

# Term columns are picked by name prefix, so the code works for any number
# of them and does not depend on `user` being the first column.
term_cols <- grep("^term", names(df), value = TRUE)

# setDT() converts df to a data.table by reference. Sum each term column per
# Group1/Group2 pair, then add the missing pairs with their sums set to 0.
setDT(df)[, lapply(.SD, sum), by = .(Group1, Group2), .SDcols = term_cols] %>%
  complete(
    Group1, Group2,
    fill = as.list(setNames(numeric(length(term_cols)), term_cols))
  )
#> # A tibble: 6 x 5
#>   Group1 Group2 term1 term2 term3
#>   <chr>  <chr>  <dbl> <dbl> <dbl>
#> 1 a      d          1     2     0
#> 2 a      e          1     0     1
#> 3 b      d          1     0     0
#> 4 b      e          1     2     0
#> 5 c      d          0     0     0
#> 6 c      e          0     2     2

通过对 R 中的变量进行分组来计算几列总和

3 个答案: