Question

我有以下问题：在一个数据框中，我有很多行和列，第一列是日期。每个日期都有不止一条观测，我想对它们进行汇总。

我的df看起来像那样（为便于使用，日期被ID替换）：

buildNumber

我想按第一列对它们进行分组，然后用不同的函数汇总所有行：

Cash 和 Price 列应该使用 sum 函数，这样我就能得到每个 ID 的 Cash 总和与 Price 总和。Weight 列应该使用 max 函数，这样每个 ID 只得到最大的 Weight。

因为列太多，我无法手工为每一列编写函数；不过只有 2 列需要用 max 汇总，其余列都应该用 sum 汇总。

所以我正在寻找一种按 ID 分组的方法：对所有列求 sum，只有 2 个指定的列取 max 值。

我尝试使用 dplyr 包：

df:
ID     Cash    Price    Weight   ...
1      0.4     0        0
1      0.2     0        82       ...
1      0       1        0        ...
1      0       3.2      80       ...
2      0.3     1        70       ...
...    ...     ...      ...      ...

但我还需要指定 2 个列，让它们不求和而是取最大值，有什么想法吗？

要清楚，示例df的输出应为：

# Attempt: group by the lower-cased ID and reduce the columns with sum() only;
# no column is reduced with max() here.
df %>% group_by(ID = tolower(ID)) %>% summarise_each(funs(sum))

Answer 1

或者没有双组：

library(dplyr)

# Reproducible toy data: ids 1 and 2 alternate over six rows, with randomly
# drawn cash, price and weight values.
set.seed(1492)
df <- data.frame(
  id = rep(c(1, 2), times = 3),
  cash = rnorm(n = 6, mean = 0.5, sd = 0.1),
  price = 6 * rnorm(n = 6, mean = 0.5, sd = 0.1),
  weight = sample(100, size = 6)
)

print(df)

##   id      cash    price weight
## 1  1 0.4410152 2.484082     10
## 2  2 0.4101343 3.032529     93
## 3  1 0.3375889 2.305076     58
## 4  2 0.6047922 3.248851     55
## 5  1 0.4721711 3.209930     34
## 6  2 0.5362493 2.331530     99

# Collapse the rows of one group to a single row: every measure column is
# summed, except `weight`, which is reduced with max().
#
# do_df: the rows of a single `id` group, as handed over by do().
# Returns a one-row data frame (summed columns first, then `weight`).
#
# `id` is dropped before summing; otherwise it is added up like any other
# column (three rows of id 1 come back as id 3). do() re-attaches the group
# label for grouping columns that are missing from the result.
# NOTE: summarise_each()/funs() are deprecated in current dplyr; across() is
# the modern replacement.
custom_summarise <- function(do_df) {

  bind_cols(
    summarise_each(select(do_df, -id, -weight), funs(sum)),
    summarise_each(select(do_df, weight), funs(max))
  )

}

group_by(df, id) %>% do(custom_summarise(.))

## Source: local data frame [2 x 4]
## Groups: id [2]
## 
##      id     cash    price weight
##   (dbl)    (dbl)    (dbl)  (int)
## 1     1 1.250775 7.999089     58
## 2     2 1.551176 8.612910     99

Answer 2

library(data.table)

# Turn df into a data.table, then aggregate per ID: Cash and Price are
# summed, Weight is reduced to its maximum.
setDT(df)

df[,
   list(Cash = sum(Cash), Price = sum(Price), Weight = max(Weight)),
   by = "ID"]

如果有 90 多列，一种做法可以是：

# Columns reduced with max(); every other non-grouping column is summed.
max_col <- "Weight"

# Leave the grouping column out: it must not be aggregated itself (an
# `ID = sum(ID)` term would duplicate the ID column and fail for
# non-numeric IDs).
sum_col <- setdiff(colnames(df), c("ID", max_col))

# Aggregate through .SD instead of pasting code together and running it with
# eval(parse()): no string-built code, and column names that are not
# syntactic R names keep working. Result columns: ID, the summed columns,
# then the max columns.
df[,
   c(lapply(.SD[, sum_col, with = FALSE], sum),
     lapply(.SD[, max_col, with = FALSE], max)),
   by = ID]

Answer 3

我们可以使用

 # Per-ID totals of Cash and Price together with the largest Weight.
 summarise(
   group_by(df, ID),
   Cash = sum(Cash),
   Price = sum(Price),
   Weight = max(Weight)
 )

如果我们有很多列，一种方法是分别进行汇总，然后用 join 把输出合并在一起。

 # Summarise each set of columns on its own (sums for Cash through Price,
 # max for Weight), then join the two results on the grouping key.
 df1 <- summarise_each(group_by(df, ID), funs(sum), Cash:Price)
 df2 <- summarise_each(group_by(df, ID), funs(max), Weight)
 inner_join(df1, df2, by = "ID")
 #      ID  Cash Price Weight
 #   (int) (dbl) (dbl)  (int)
 #1     1   0.6   4.2     82
 #2     2   0.3   1.0     70

Answer 4

这是基于 dplyr 代码仓库中某个 issue 评论的解决方案。我认为它足够通用，可以应用于更复杂的情况。

library(tidyverse)
# Example data, written column by column: three IDs with three rows each.
df <- tibble(
  ID = rep(c("a", "b", "c"), each = 3),
  Cash = c(4, 7, 7, 2, 5, 8, 2, 3, 1),
  Price = c(6, 3, 9, 8, 1, 0, 1, 8, 9),
  Weight = c(8, 0, 0, 8, 8, 1, 1, 0, 1)
)

# Pair each column selection with the function that reduces it, run one
# grouped summarise_at() per pair, then join the per-pair results.
out <- pmap(
  list(
    .vars = lst(vars(-Weight), vars(Weight)),
    .funs = lst(sum, max)
  ),
  ~ summarise_at(group_by(df, ID), .x, .y)
) %>%
  reduce(inner_join)
out
# A tibble: 3 x 4
#   ID     Cash Price Weight
#   <chr> <dbl> <dbl>  <dbl>
# 1 a        18    18      8
# 2 b        15     9      8
# 3 c         6    18      1

您应该在第一个 lst 中指定变量（例如 vars(-Weight), vars(Weight)），并在第二个 lst 中给出要应用的相应函数（例如 sum, max）。summarise_at 中的 .x 参数引用变量 lst 中的元素，而 .y 引用函数 lst 中的元素。

Answer 5

从 dplyr 1.0.0 开始，您可以使用 across()：

# Per ID: the max1/max2 columns are reduced with max(), all remaining
# columns with sum(); NA values are ignored in both.
tribble(
  ~ID, ~max1, ~max2, ~sum1, ~sum2, ~sum3,
  1, 1, 1, 1, 2, 3,
  1, 2, 3, 1, 2, 3,
  2, 1, 1, 1, 2, 3,
  2, 3, 4, 2, 3, 4,
  3, 1, 1, 1, 2, 3,
  3, 4, 5, 3, 4, 5,
  3, NA, NA, NA, NA, NA
) %>%
  group_by(ID) %>%
  summarize(
    # Anonymous functions instead of passing na.rm through across()'s `...`
    # (deprecated as of dplyr 1.1.0); TRUE instead of the reassignable T.
    across(matches("max1|max2"), function(x) max(x, na.rm = TRUE)),
    across(!matches("max1|max2"), function(x) sum(x, na.rm = TRUE))
  )

# ID  max1  max2  sum1  sum2  sum3
#  1     2     3     2     4     6
#  2     3     4     3     5     7
#  3     4     5     4     6     8

总结具有不同功能的不同列

5 个答案: