如何通过变量列表聚合数据框,保留未分组的列并指示子集?

时间:2018-03-09 12:47:05

标签: r aggregate subset

我想按多个分组变量(a、b、c)对数据框进行分组,从而在逻辑上把它划分为若干子集。之后我想添加两列:一列是每个子集的行数(count),另一列是子集的编号(id)。

# Reproducible example data: `n` rows with three grouping keys (a, b, c)
# and five binary payload columns (d, k, l, m, n).
set.seed(67)

n <- 1000

# Keep the column order as is: the random draws are consumed in the order
# in which the arguments are evaluated.
df1 <- data.frame(
  a = rbinom(n, size = 1, prob = 0.5),
  b = sample(20:40, size = n, replace = TRUE),
  c = sample(seq(from = 3000, to = 4000, by = 100), size = n, replace = TRUE),
  d = rbinom(n, size = 1, prob = 0.13),
  k = rbinom(n, size = 1, prob = 0.88),
  l = rbinom(n, size = 1, prob = 0.075),
  m = rbinom(n, size = 1, prob = 0.05),
  n = rbinom(n, size = 1, prob = 0.3)
)

> head(df1)
  a  b    c d k l m n
1 1 21 3900 0 1 0 0 0
2 0 26 3600 0 1 0 0 0
3 0 23 3900 0 1 0 0 0
4 1 23 3900 0 1 0 0 0
5 0 32 4000 1 1 0 0 0
6 1 23 3200 0 0 0 0 0

我已经能正确得到各组的计数,但还需要保留其他变量。

> with(df1, aggregate(d, list(a, b, c), length))
    Group.1 Group.2 Group.3 x
1         0      20    3000 2
2         1      20    3000 3
3         0      21    3000 2
4         1      21    3000 3
5         0      22    3000 3
6         1      22    3000 1
...

当我把整个数据框作为聚合对象传入时,同样能得到计数,但各列原有的值都被计数覆盖了:

> with(df1, aggregate(df1, list(a, b, c), length))
    Group.1 Group.2 Group.3 a b c d k l m n
1         0      20    3000 2 2 2 2 2 2 2 2
2         1      20    3000 3 3 3 3 3 3 3 3
3         0      21    3000 2 2 2 2 2 2 2 2
4         1      21    3000 3 3 3 3 3 3 3 3
5         0      22    3000 3 3 3 3 3 3 3 3
6         1      22    3000 1 1 1 1 1 1 1 1
...

实际上我想要的是这样的结果:

    a  b    c d k l m n count id
847 0 20 3000 1 1 0 0 1     2  1
939 0 20 3000 0 0 0 0 0     2  1
264 1 21 3000 0 1 0 0 0     3  2
569 1 21 3000 0 1 0 0 0     3  2
876 1 21 3000 0 1 0 0 1     3  2
346 0 22 3000 0 1 0 0 1     3  3
846 0 22 3000 0 1 0 0 0     3  3
929 0 22 3000 0 1 0 0 1     3  3
...

我该怎么做?

4 个答案:

答案 0 :(得分:2)

在基础 R(base R)中,您可以使用 ave ……

# Order rows by c, then b, then a, so each (a, b, c) group is contiguous.
df1 <- df1[with(df1, order(c, b, a)), ]
# Running group number: increments at the first row of every new
# (a, b, c) combination, so ids follow the sort order above.
df1$id <- cumsum(!duplicated(df1[c("a", "b", "c")]))
# Group size, repeated on every row of the group.
df1$count <- ave(df1$a, df1$id, FUN = length)

head(df1)
    a  b    c d k l m n id count
847 0 20 3000 1 1 0 0 1  1     2
939 0 20 3000 0 0 0 0 0  1     2
217 1 20 3000 0 1 0 0 0  2     3
458 1 20 3000 0 1 0 0 0  2     3
631 1 20 3000 0 1 0 0 0  2     3
360 0 21 3000 0 1 1 0 0  3     2

数据框的排序方式会影响得到的 id 值,不过希望这一点无关紧要。

答案 1 :(得分:1)

这是你想要的结果吗?

library(tidyverse)

# Add the per-group row count, then number the (a, b, c) groups in the
# order in which they appear after sorting by b, c, a.
df1 %>%
  group_by(a, b, c) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  arrange(b, c, a) %>%
  # Join the keys with a separator: plain paste0(a, b, c) could merge
  # distinct groups whose concatenated digits coincide.
  mutate(id = cumsum(!duplicated(paste(a, b, c, sep = "_"))))
    ## A tibble: 1,000 x 10
    #       a     b     c     d     k     l     m     n count    id
    #   <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
    # 1     0    20 3000.     1     1     0     0     1     2     1
    # 2     0    20 3000.     0     0     0     0     0     2     1
    # 3     1    20 3000.     0     1     0     0     0     3     2
    # 4     1    20 3000.     0     1     0     0     0     3     2
    # 5     1    20 3000.     0     1     0     0     0     3     2
    # 6     0    20 3100.     0     1     0     0     0     2     3
    # 7     0    20 3100.     0     1     1     0     0     2     3
    # 8     1    20 3100.     0     1     0     0     0     1     4
    # 9     0    20 3200.     1     1     0     0     0     3     5
    #10     0    20 3200.     0     1     0     0     0     3     5
    ## ... with 990 more rows

答案 2 :(得分:1)

以下是与 Maurits Evers 的答案类似的解法,使用了 group_indices:
library(tidyverse)

# Number the (a, b, c) groups and attach each group's size to its rows.
df1 %>%
  # Group first, then extract the indices: passing grouping columns
  # straight to group_indices() is deprecated in dplyr >= 1.0.0.
  mutate(id = group_indices(group_by(., a, b, c))) %>%
  group_by(a, b, c) %>%
  mutate(count = n()) %>%  # number of rows in each group
  # Sort by a, b, c (or however you prefer). The result is still grouped
  # by a, b, c; add ungroup() if later steps should not be group-wise.
  arrange(a, b, c)

#output
# A tibble: 1,000 x 10
# Groups: a, b, c [414]
       a     b     c     d     k     l     m     n    id count
   <int> <int> <dbl> <int> <int> <int> <int> <int> <int> <int>
 1     0    20  3000     1     1     0     0     1     1     2
 2     0    20  3000     0     0     0     0     0     1     2
 3     0    20  3100     0     1     0     0     0     2     2
 4     0    20  3100     0     1     1     0     0     2     2
 5     0    20  3200     1     1     0     0     0     3     3
 6     0    20  3200     0     1     0     0     0     3     3
 7     0    20  3200     0     1     0     0     0     3     3
 8     0    20  3300     1     1     0     0     1     4     2
 9     0    20  3300     0     1     0     0     0     4     2
10     0    20  3400     0     1     0     0     1     5     1
# ... with 990 more rows

答案 3 :(得分:1)

使用 data.table,可以借助内置的 .GRP 和 .N 变量,只用几行代码就完成。

library(data.table)

# Convert df1 to a data.table, then sort so that the group ids assigned
# below follow the c, b, a order.
setDT(df1)
df1 <- df1[order(c, b, a)]
# Add both columns by reference: .N is the group's row count and .GRP
# is the running group counter for each (a, b, c) combination.
df1[, `:=`(count = .N, id = .GRP), by = .(a, b, c)]
print(head(df1))

   a  b    c d k l m n count    id
1: 0 20 3000 1 1 0 0 1     2     1
2: 0 20 3000 0 0 0 0 0     2     1
3: 1 20 3000 0 1 0 0 0     3     2
4: 1 20 3000 0 1 0 0 0     3     2
5: 1 20 3000 0 1 0 0 0     3     2
6: 0 21 3000 0 1 1 0 0     2     3