根据R中另一列的值,按ID计算列的总和

时间:2015-03-19 14:12:56

标签: r

我有一些看起来像这样的数据:

E_Add  Action  ActionType  Call  Callback  Email
xxxx   Task    Call        1     0         0
xxxx   Task    Call        1     0         0
xxxx   Event   Start       0     0         0
xxxx   Task    Call        1     0         0
xxxx   Event   Trial       0     0         0
yyyy   Task    Call        1     0         0
yyyy   Task    Callback    0     1         0
yyyy   Task    Email       0     0         1
yyyy   Task    Call        1     0         0
yyyy   Event   Start       0     0         0

我希望它看起来像这样:

E_Add  Action  ActionType  Call  Callback  Email CallSum CallbackSum EmailSum
xxxx   Task    Call        1     0         0
xxxx   Task    Call        1     0         0     2
xxxx   Event   Start       0     0         0     
xxxx   Task    Call        1     0         0     1
xxxx   Event   Trial       0     0         0
yyyy   Task    Call        1     0         0            
yyyy   Task    Callback    0     1         0             1
yyyy   Task    Email       0     0         1                         1
yyyy   Task    Call        1     0         0     2
yyyy   Event   Start       0     0         0

我的意思是,结果不必与此完全一致,但这大致就是我想要的效果。我想按电子邮件地址(E_Add)分组,对 Call、Callback 和 Email 三列分别求和,并把结果放入新列中。另外,每当 Action 列中出现 "Event" 时,Call、Callback 和 Email 的累计和都要重新开始计算;当然,这种重置同样是在每个电子邮件地址内部进行的。

3 个答案:

答案 0 :(得分:2)

您想要的输出不是很清楚,但我认为下面的代码可以满足您的需求(另外请注意,您的数据中 Email 列出现了两次)。

library(data.table)
# Names of the columns to be summed.
cols <- c("Call", "Callback", "Email") # Choose columns to modify

第一个解决方案(简单版)

# Convert `df` to a data.table by reference and add one "<col>Sum" column per
# entry of `cols`. Rows are grouped by E_Add and by a run counter that
# increases at every row whose Action is "Event". Within each group the new
# column is 0 everywhere except on the group's last row, which carries the
# group total. The trailing `[]` prints the updated table.
setDT(df)[, paste0(cols, "Sum") :=
            lapply(.SD, function(v) replace(rep(0L, .N), .N, sum(v))),
          by = list(E_Add, cumsum(Action == "Event")),
          .SDcols = cols][]

#     E_Add   Action ActionType Call Callback Email.1 CallSum CallbackSum EmailSum
#  1:  xxxx   Task       Call    1        0       0       0           0          0
#  2:  xxxx   Task       Call    1        0       0       2           0          0
#  3:  xxxx  Event      Start    0        0       0       0           0          0
#  4:  xxxx   Task       Call    1        0       0       1           0          0
#  5:  xxxx  Event      Trial    0        0       0       0           0          0
#  6:  yyyy   Task       Call    1        0       0       0           0          0
#  7:  yyyy   Task   Callback    0        1       0       0           0          0
#  8:  yyyy   Task      Email    0        0       1       0           0          0
#  9:  yyyy   Task       Call    1        0       0       2           1          1
# 10:  yyyy  Event      Start    0        0       0       0           0          0

第二个解决方案(与您期望的输出完全一致)

# Same grouping as the simple version (E_Add plus a run counter that increases
# at every "Event" row), but the group total is written on the last row of the
# group where the column equals 1, and every other row gets 0. Groups that
# contain no 1 at all are filled with 0. The trailing `[]` prints the result.
setDT(df)[, paste0(cols, "Sum") :=
            lapply(.SD, function(v) {
              # Nothing to report for this group: a single 0 is recycled.
              if (!any(v == 1L)) {
                return(0L)
              }
              last_hit <- max(which(v == 1L))
              group_total <- sum(v)
              v[-last_hit] <- 0L
              v[last_hit] <- group_total
              v
            }),
          by = list(E_Add, cumsum(Action == "Event")),
          .SDcols = cols][]

#     E_Add   Action ActionType Call Callback Email.1 CallSum CallbackSum EmailSum
#  1:  xxxx   Task       Call    1        0       0       0           0          0
#  2:  xxxx   Task       Call    1        0       0       2           0          0
#  3:  xxxx  Event      Start    0        0       0       0           0          0
#  4:  xxxx   Task       Call    1        0       0       1           0          0
#  5:  xxxx  Event      Trial    0        0       0       0           0          0
#  6:  yyyy   Task       Call    1        0       0       0           0          0
#  7:  yyyy   Task   Callback    0        1       0       0           1          0
#  8:  yyyy   Task      Email    0        0       1       0           0          1
#  9:  yyyy   Task       Call    1        0       0       2           0          0
# 10:  yyyy  Event      Start    0        0       0       0           0          0

根据评论进行的修改(如果您想把总和显示在 Event 所在的行上)

# Variant that reports each total on the "Event" row that closes a run.
# The event flag is shifted down by one row before cumsum(), so an "Event" row
# stays in the same group as the rows that precede it and becomes that group's
# last row, which is where the total is written (all other rows get 0).
# setDT() is called here as well, so the snippet also works when `df` is still
# a plain data.frame; it is a no-op when `df` is already a data.table.
# The trailing `[]` prints the updated table.
setDT(df)[, paste0(cols, "Sum") :=
            lapply(.SD, function(x) c(rep(0L, .N - 1L), sum(x))),
          by = .(E_Add, cumsum(c(FALSE, (Action == "Event")[-length(Action)]))),
          .SDcols = cols][]

#     E_Add Action ActionType Call Callback Email CallSum CallbackSum EmailSum
#  1:  xxxx   Task       Call    1        0     0       0           0        0
#  2:  xxxx   Task       Call    1        0     0       0           0        0
#  3:  xxxx  Event      Start    0        0     0       2           0        0
#  4:  xxxx   Task       Call    1        0     0       0           0        0
#  5:  xxxx  Event      Trial    0        0     0       1           0        0
#  6:  yyyy   Task       Call    1        0     0       0           0        0
#  7:  yyyy   Task   Callback    0        1     0       0           0        0
#  8:  yyyy   Task      Email    0        0     1       0           0        0
#  9:  yyyy   Task       Call    1        0     0       0           0        0
# 10:  yyyy  Event      Start    0        0     0       2           1        1

答案 1 :(得分:1)

# Sample data: one row per logged action. `Email` identifies the contact,
# `Action` is "Task" or "Event", and Call / Callback / Emails are 0/1 flags.
df <- data.frame(
  Email = factor(rep(c("xxxx", "yyyy"), each = 5L), levels = c("xxxx", "yyyy")),
  Action = factor(
    c("Task", "Task", "Event", "Task", "Event",
      "Task", "Task", "Task", "Task", "Event"),
    levels = c("Event", "Task")
  ),
  ActionType = factor(
    c("Call", "Call", "Start", "Call", "Trial",
      "Call", "Callback", "Email", "Call", "Start"),
    levels = c("Call", "Callback", "Email", "Start", "Trial")
  ),
  Call = c(1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L),
  Callback = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
  Emails = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)
)

#' Add per-run totals on "Event" rows
#'
#' A run is the stretch of rows of one ID up to and including the next row
#' whose action is "Event". For every value column the run total is written
#' (as text) on the closing "Event" row; all other rows get an empty string.
#' Rows of an ID that are never followed by an "Event" of the same ID do not
#' contribute to any total, so counts cannot leak into the next ID.
#'
#' @param data A data.frame.
#' @param id_col Name of the ID column.
#' @param action_col Name of the column holding "Event" / other actions.
#' @param value_cols Names of the numeric columns to total.
#' @param sum_cols Names of the character columns to create, one per entry
#'   of `value_cols`.
#' @return `data` with the `sum_cols` columns added (or overwritten).
add_event_sums <- function(data,
                           id_col = "Email",
                           action_col = "Action",
                           value_cols = c("Call", "Callback", "Emails"),
                           sum_cols = c("CallSum", "CallBackSum", "EmailSum")) {
  stopifnot(length(value_cols) == length(sum_cols))

  # Empty input: just create empty text columns.
  if (nrow(data) == 0L) {
    for (nm in sum_cols) {
      data[[nm]] <- character(0)
    }
    return(data)
  }

  # %in% treats a missing action as "not an event" instead of failing.
  is_event <- data[[action_col]] %in% "Event"

  # Number of "Event" rows strictly before each row: an "Event" row therefore
  # shares its run id with the rows leading up to it.
  run_id <- cumsum(c(0L, head(is_event, -1L)))

  for (k in seq_along(value_cols)) {
    # Total per (ID, run), repeated on every row of that group.
    totals <- ave(data[[value_cols[k]]], data[[id_col]], run_id, FUN = sum)
    data[[sum_cols[k]]] <- ifelse(is_event, as.character(totals), "")
  }
  data
}

df <- add_event_sums(df)



   Email Action ActionType Call Callback Emails CallSum CallBackSum EmailSum
1   xxxx   Task       Call    1        0      0                             
2   xxxx   Task       Call    1        0      0                             
3   xxxx  Event      Start    0        0      0       2           0        0
4   xxxx   Task       Call    1        0      0                             
5   xxxx  Event      Trial    0        0      0       1           0        0
6   yyyy   Task       Call    1        0      0                             
7   yyyy   Task   Callback    0        1      0                             
8   yyyy   Task      Email    0        0      1                             
9   yyyy   Task       Call    1        0      0                             
10  yyyy  Event      Start    0        0      0       2           1        1

答案 2 :(得分:1)

这是我的尝试。我最终直接覆盖了三列(即 Call、Callback 和 Email.1),这可能是得到您想要结果的一种方式。首先,我在第一个 mutate 中创建了一个分组变量;然后按 Email 和该分组变量对数据分组,并计算 Call、Callback 和 Email.1 的总和。最后,我希望非 Event 的行显示零而不是空白,所以在最后一个 mutate 中使用了 replace()。

library(zoo)
library(dplyr)

# Overwrite Call, Callback and Email.1 with per-run totals shown on "Event"
# rows only (0 elsewhere).
#
# 1. Within each Email, tag every "Event" row with its row number and carry
#    that tag backwards (fromLast = TRUE) so the rows leading up to an Event
#    share its tag. na.rm = FALSE keeps the vector length intact when an Email
#    ends with rows that are not followed by an Event; those rows keep an NA
#    tag and end up as 0 in step 3.
# 2. Sum the three columns within each (Email, group).
# 3. Blank out (set to 0) every row that is not an "Event".
#
# across() replaces the deprecated mutate_each() / funs() pair.
mydf %>%
  group_by(Email) %>%
  mutate(
    group = ifelse(Action == "Event", row_number(), NA),
    group = na.locf(group, fromLast = TRUE, na.rm = FALSE)
  ) %>%
  group_by(Email, group) %>%
  mutate(across(Call:Email.1, function(x) sum(x, na.rm = TRUE))) %>%
  mutate(across(Call:Email.1, function(x) replace(x, which(Action != "Event"), 0))) %>%
  ungroup() %>%
  select(-group)

#   Email Action ActionType Call Callback Email.1
#1   xxxx   Task       Call    0        0       0
#2   xxxx   Task       Call    0        0       0
#3   xxxx  Event      Start    2        0       0
#4   xxxx   Task       Call    0        0       0
#5   xxxx  Event      Trial    1        0       0
#6   yyyy   Task       Call    0        0       0
#7   yyyy   Task   Callback    0        0       0
#8   yyyy   Task      Email    0        0       0
#9   yyyy   Task       Call    0        0       0
#10  yyyy  Event      Start    2        1       1

DATA

# Sample data used by the pipeline: ten logged actions for two contacts.
# Factor levels are spelled out so the object matches the dput() original.
mydf <- data.frame(
  Email = factor(rep(c("xxxx", "yyyy"), each = 5L), levels = c("xxxx", "yyyy")),
  Action = factor(
    c("Task", "Task", "Event", "Task", "Event",
      "Task", "Task", "Task", "Task", "Event"),
    levels = c("Event", "Task")
  ),
  ActionType = factor(
    c("Call", "Call", "Start", "Call", "Trial",
      "Call", "Callback", "Email", "Call", "Start"),
    levels = c("Call", "Callback", "Email", "Start", "Trial")
  ),
  Call = c(1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L),
  Callback = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
  Email.1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)
)