Question

我正在寻找一种高效的方法来处理数据框中的多个变量。现在我正在使用dplyr，但随着变量增多，这种做法会变得越来越繁琐。假设我有以下数据框，其中brd是汽车品牌，ye是年份，type是汽车类型，cy和hp是该类型的特征。

# Example data: two brands, two years, three car types per brand and year.
brd <- rep(rep(c("BMW", "Volvo"), each = 3), times = 2)
ye <- rep(c(99, 98), each = 6)
type <- rep(c(1, 2, 3), times = 4)
# Type characteristics, one value per row.
cy <- c(
  1895, 1991, 1587, 2435, 2435, 1596,
  1991, 1588, 1984, 1596, 1991, 1588
)
hp <- c(
  77, 110, 80, 103, 103, 75,
  110, 77, 93, 75, 110, 77
)

# Assemble all five columns in one call, then print the result.
df <- data.frame(brd = brd, ye = ye, type = type, cy = cy, hp = hp)
df
     brd ye type   cy  hp
1    BMW 99    1 1895  77
2    BMW 99    2 1991 110
3    BMW 99    3 1587  80
4  Volvo 99    1 2435 103
5  Volvo 99    2 2435 103
6  Volvo 99    3 1596  75
7    BMW 98    1 1991 110
8    BMW 98    2 1588  77
9    BMW 98    3 1984  93
10 Volvo 98    1 1596  75
11 Volvo 98    2 1991 110
12 Volvo 98    3 1588  77

对于每年，我想计算同一品牌的所有其他产品的产品特征总和，并将其作为新变量添加到数据框中。现在，我正在使用这样的dplyr：

library(dplyr)
# Within each brand/year group, compute the total of every characteristic;
# subtracting the row's own value then gives the sum over all *other*
# products of the same brand in the same year.
df <- df %>%
  group_by(brd, ye) %>%
  mutate(
    sumall_cy = sum(cy),
    sumall_hp = sum(hp),
    sumother_cy = sumall_cy - cy,
    # Previously `sumall_li - hp`: no such column exists, so mutate() failed.
    sumother_hp = sumall_hp - hp
  ) %>%
  ungroup()  # drop the grouping so later verbs are not silently grouped

所以我得到了

      brd    ye  type    cy    hp sumall_cy sumall_hp sumother_cy sumother_hp
   <fctr> <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>       <dbl>       <dbl>
1     BMW    99     1  1895    77      5473       267        3578         190
2     BMW    99     2  1991   110      5473       267        3482         157
3     BMW    99     3  1587    80      5473       267        3886         187
4   Volvo    99     1  2435   103      6466       281        4031         178
5   Volvo    99     2  2435   103      6466       281        4031         178
6   Volvo    99     3  1596    75      6466       281        4870         206
7     BMW    98     1  1991   110      5563       280        3572         170
8     BMW    98     2  1588    77      5563       280        3975         203
9     BMW    98     3  1984    93      5563       280        3579         187
10  Volvo    98     1  1596    75      5175       262        3579         187
11  Volvo    98     2  1991   110      5175       262        3184         152
12  Volvo    98     3  1588    77      5175       262        3587         185

有更有效的方法吗？我正在考虑像这个stata代码一样循环：

* For each characteristic: total within brand and year, then the total over
* all other products (group total minus the row's own value).
* The closing brace must sit on a line of its own, and the groups must
* include brd so the sums are per brand and year rather than per year only.
foreach x of varlist hp cy {
    bysort brd ye: egen sumall_`x' = sum(`x')
    gen sumother_`x' = sumall_`x' - `x'
}

任何建议都将不胜感激。

Answer 1

以下是non-standard evaluation的解决方案，group_by操作只需要执行一次，并且当您需要处理更多列时也能正常工作：

library(dplyr)  # 0.7.0
library(rlang)  # required for the `syms` function

# Columns for which the "sum over all other rows of the group" is wanted.
varlist <- c("cy", "hp")

# Build one quosure per column: group total minus the row's own value.
# lapply() always returns a plain list (sapply() may simplify its result),
# and `!!` replaces the deprecated UQ().
ops <- lapply(syms(varlist), function(x) quo(sum(!!x) - !!x))

# Name each expression after the column it creates, e.g. "sumother_cy".
names(ops) <- paste("sumother", varlist, sep = "_")

# Splice all expressions into a single grouped mutate() and print the result.
df %>%
  group_by(brd, ye) %>%
  mutate(!!!ops) %>%
  ungroup()
# # A tibble: 12 x 7
#       brd    ye  type    cy    hp sumother_cy sumother_hp
#    <fctr> <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>
#  1    BMW    99     1  1895    77        3578         190
#  2    BMW    99     2  1991   110        3482         157
#  3    BMW    99     3  1587    80        3886         187
#  4  Volvo    99     1  2435   103        4031         178
#  5  Volvo    99     2  2435   103        4031         178
#  6  Volvo    99     3  1596    75        4870         206
#  7    BMW    98     1  1991   110        3572         170
#  8    BMW    98     2  1588    77        3975         203
#  9    BMW    98     3  1984    93        3579         187
# 10  Volvo    98     1  1596    75        3579         187
# 11  Volvo    98     2  1991   110        3184         152
# 12  Volvo    98     3  1588    77        3587         185

如果我们想保留sumall_列，我们可以尝试：

# For every column build two quosures -- the group total and the total of
# the other rows -- and flatten them into one plain list. Flattening
# explicitly avoids depending on sapply() simplifying the result into a
# list-matrix; `!!` replaces the deprecated UQ().
ops <- unlist(
  lapply(syms(varlist), function(x) {
    list(quo(sum(!!x)), quo(sum(!!x) - !!x))
  }),
  recursive = FALSE
)
# Names follow the flattened order: sumall_<col>, sumother_<col> per column.
names(ops) <- paste(
  rep(c("sumall", "sumother"), times = length(varlist)),
  rep(varlist, each = 2),
  sep = "_"
)
df %>%
  group_by(brd, ye) %>%
  mutate(!!!ops) %>%
  ungroup()

# # A tibble: 12 x 9
#       brd    ye  type    cy    hp sumall_cy sumother_cy sumall_hp sumother_hp
#    <fctr> <dbl> <dbl> <dbl> <dbl>     <dbl>       <dbl>     <dbl>       <dbl>
#  1    BMW    99     1  1895    77      5473        3578       267         190
#  2    BMW    99     2  1991   110      5473        3482       267         157
#  3    BMW    99     3  1587    80      5473        3886       267         187
#  4  Volvo    99     1  2435   103      6466        4031       281         178
#  5  Volvo    99     2  2435   103      6466        4031       281         178
#  6  Volvo    99     3  1596    75      6466        4870       281         206
#  7    BMW    98     1  1991   110      5563        3572       280         170
#  8    BMW    98     2  1588    77      5563        3975       280         203
#  9    BMW    98     3  1984    93      5563        3579       280         187
# 10  Volvo    98     1  1596    75      5175        3579       262         187
# 11  Volvo    98     2  1991   110      5175        3184       262         152
# 12  Volvo    98     3  1588    77      5175        3587       262         185

Answer 2

如果有许多类型特征，例如cy和hp，我建议将数据重新整形为长格式，并在长格式下完成所有类似的转换。为此，可以使用data.table包中的melt()和dcast()：

library(data.table)   # CRAN version 1.10.4 used
# Convert the data frame into a data.table.
DT <- data.table(df)
# Go from wide to long. The id columns are listed explicitly so that every
# remaining column is treated as a measure variable.
long <- melt(DT, id.vars = c("brd", "ye", "type"))
# Add the per-group total by reference, then the total of the other rows.
long[, sumall := sum(value), by = c("brd", "ye", "variable")]
long[, sumother := sumall - value]
# Print the long table with its new columns.
long[]
# Go back from long to wide: one column per value.var / variable pair.
dcast(
  long,
  brd + ye + type ~ ...,
  value.var = c("value", "sumall", "sumother")
)

      brd ye type value_cy value_hp sumall_cy sumall_hp sumother_cy sumother_hp
 1:   BMW 98    1     1991      110      5563       280        3572         170
 2:   BMW 98    2     1588       77      5563       280        3975         203
 3:   BMW 98    3     1984       93      5563       280        3579         187
 4:   BMW 99    1     1895       77      5473       267        3578         190
 5:   BMW 99    2     1991      110      5473       267        3482         157
 6:   BMW 99    3     1587       80      5473       267        3886         187
 7: Volvo 98    1     1596       75      5175       262        3579         187
 8: Volvo 98    2     1991      110      5175       262        3184         152
 9: Volvo 98    3     1588       77      5175       262        3587         185
10: Volvo 99    1     2435      103      6466       281        4031         178
11: Volvo 99    2     2435      103      6466       281        4031         178
12: Volvo 99    3     1596       75      6466       281        4870         206

如果最终结果中不需要sumall列，则可以在重新整形前将其删除：

# Drop the helper column by reference first, then reshape to wide format.
long[, sumall := NULL]
dcast(long, brd + ye + type ~ ..., value.var = c("value", "sumother"))

      brd ye type value_cy value_hp sumother_cy sumother_hp
 1:   BMW 98    1     1991      110        3572         170
 2:   BMW 98    2     1588       77        3975         203
 3:   BMW 98    3     1984       93        3579         187
 4:   BMW 99    1     1895       77        3578         190
 5:   BMW 99    2     1991      110        3482         157
 6:   BMW 99    3     1587       80        3886         187
 7: Volvo 98    1     1596       75        3579         187
 8: Volvo 98    2     1991      110        3184         152
 9: Volvo 98    3     1588       77        3587         185
10: Volvo 99    1     2435      103        4031         178
11: Volvo 99    2     2435      103        4031         178
12: Volvo 99    3     1596       75        4870         206

在循环中汇总并生成多个变量

2 个答案: