R:通过两个或更多个分组变量获得两个或更多变量的聚合

时间:2015-05-20 22:47:27

标签: r

使用以下数据框,我需要计算两个变量“CallsHandled”和“Engaged”的每月总和,并按以下分组变量分组:“Month”,“ID”,“Location”,“LANGUAGE”,“MemRegion”

# dput() representation of the example data frame: 20 rows and 11 columns.
# ID, Location, LANGUAGE and MemRegion are factors; Month, Week, CallsHandled,
# Engaged and the three Queue* columns are integers. QueueA/QueueB/QueueC hold
# 0/1 flags.
structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Week = c(1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L), ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", 
"F1234"), class = "factor"), Location = structure(c(2L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 
1L), .Label = c("Corona", "Denver"), class = "factor"), LANGUAGE = structure(c(1L, 
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"), 
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", 
"SCAL"), class = "factor"), CallsHandled = c(1L, 1L, 8L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 
1L, 1L, 2L), Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 
200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 
1243L, 75L, 45L, 55L), QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), 
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 
0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), QueueC = c(0L, 1L, 0L, 0L, 
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 
0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", 
"MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"
), class = "data.frame", row.names = c(NA, -20L))

此外,如果要把“QueueA:QueueC”也作为分组变量,我是否必须先将它们合并成一列?如果是,该怎么做?

3 个答案:

答案 0 :(得分:1)

这个问题分为两部分:首先是如何分组并求和,其次是如何将QueueA:QueueC合并成一列。

对于第一个问题,您可以使用库dplyr,这样可以更轻松,更直观。

# Rebuild the example data frame from the question and bind it to `df`.
# It has 20 rows and 11 columns: ID, Location, LANGUAGE and MemRegion are
# factors, the remaining columns are integers, and QueueA/QueueB/QueueC hold
# 0/1 flags.
df <- structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
                     Week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), 
                     ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", "F1234"), class = "factor"), 
                     Location = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Corona", "Denver"), class = "factor"), 
                     LANGUAGE = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"), 
                     MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", "SCAL"), class = "factor"), 
                     CallsHandled = c(1L, 1L, 8L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 1L, 1L, 2L), 
                     Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 1243L, 75L, 45L, 55L), 
                     QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), 
                     QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), 
                     QueueC = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", "MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"),
                class = "data.frame", row.names = c(NA, -20L))

library(dplyr)

# Group by every variable requested in the question -- including MemRegion,
# which the grouping previously left out -- and attach the per-group totals.
# mutate() keeps all original rows and repeats the group total on each of
# them; replace it with summarise() to get exactly one row per group.
df %>%
  group_by(Month, ID, Location, LANGUAGE, MemRegion) %>%
  mutate(
    TotalCallsHandled = sum(CallsHandled),
    TotalEngaged = sum(Engaged)
  )

首先,我们使用group_by按您列出的变量进行分组,然后用mutate在每个组内分别对CallsHandled和Engaged求和,这样就能直观地得到您想要的结果。

要将所有内容组合成一列,可能有很多方法可以做到这一点,但最直接的方法可能是为每列创建某种唯一标识符,并将所有列合并为一列。

# Collapse the three 0/1 indicator columns into a single factor column.
# Weighting the flags by 1, 2 and 3 gives the code 1 for QueueA, 2 for QueueB
# and 3 for QueueC; this assumes exactly one flag is set in every row.
# Supplying levels and labels to factor() pins the code-to-letter mapping.
# Relabelling with levels<- afterwards would silently shift the labels when a
# queue is absent from the data and would error on an unexpected code such as
# 0; here any code outside 1:3 becomes NA instead.
df$Queue <- factor(
  df$QueueA + df$QueueB * 2 + df$QueueC * 3,
  levels = 1:3,
  labels = c("A", "B", "C")
)

由于每一列都是0/1标志,我们可以把标志重新编码为 1 -> A,2 -> B,3 -> C,然后把因子水平重新标记为A,B,C。之后就可以再次使用group_by函数来获得上述预期结果。

# Per-group totals with the combined Queue factor as an additional grouping
# key. mutate() keeps all rows, so each row carries the total of its group.
# The three 0/1 indicator columns are dropped at the end because Queue
# already encodes them.
df %>%
  group_by(Month, ID, Location, LANGUAGE, Queue) %>%
  mutate(
    TotalCallsHandled = sum(CallsHandled),
    TotalEngaged = sum(Engaged)
  ) %>%
  select(-c(QueueA, QueueB, QueueC))

输出:

Source: local data frame [20 x 11]
Groups: Month, ID, Location, LANGUAGE, Queue

   Month Week    ID Location LANGUAGE MemRegion CallsHandled Engaged Queue
1      1    1 F1234   Denver  English      NCAL            1     120     B
2      1    2 F1234   Corona  Spanish      SCAL            1      30     C
3      1    3 F1234   Corona  English      NCAL            8    1243     B
4      1    4 F1234   Corona  Spanish      NCAL            1      75     A
5      1    5 F1234   Corona  Spanish      SCAL            1      45     A
6      1    6 F1234   Denver  English      SCAL            2      55     B
7      1    7 F1234   Corona  English      NCAL            1     200     C
8      1    8 F1234   Corona  English      NCAL            1     120     B
9      1    9 F1234   Denver  English      NCAL            1      30     A
10     1   10 F1234   Corona  Spanish      NCAL            1     230     C
11     1    1 A1234   Corona  English      NCAL           10    2065     C
12     1    2 A1234   Corona  English      SCAL            1      45     A
13     1    3 A1234   Corona  Spanish      NCAL            3      55     A
14     1    4 A1234   Corona  English      NCAL            1     200     A
15     1    5 A1234   Corona  English      SCAL            8    1483     B
16     1    6 A1234   Denver  English      SCAL            1      30     B
17     1    7 A1234   Corona  Spanish      SCAL            6    1243     C
18     1    8 A1234   Corona  Spanish      SCAL            1      75     B
19     1    9 A1234   Corona  Spanish      SCAL            1      45     C
20     1   10 A1234   Corona  English      SCAL            2      55     B
Variables not shown: TotalCallsHandled (int), TotalEngaged (int)

答案 1 :(得分:0)

要将Queue变量转换为单个因子变量,您可以这样做:

# Collapse the three 0/1 indicator columns of `dat` into one factor column
# named `queues`. This assumes exactly one flag equals 1 in every row;
# otherwise the result has a different length than `dat` and data.frame()
# below fails.
queue_cols <- c("QueueA", "QueueB", "QueueC")
# which(..., arr.ind = TRUE) returns one (row, col) index pair per flag that
# equals 1, listed column by column.
hits <- which(dat[, queue_cols] == 1, arr.ind = TRUE)
# Put the pairs back into row order and keep only the column index (1, 2, 3).
queues <- hits[order(hits[, "row"]), "col"]
# Explicit levels keep the index-to-name mapping fixed even when one of the
# queues never occurs; factor(labels = ) alone errors in that case.
queues <- factor(queues, levels = seq_along(queue_cols), labels = queue_cols)
dat <- data.frame(dat, queues)

不过,@chappers的方法更好。

然后,您可以使用aggregate

# Sum CallsHandled and Engaged within every combination of the grouping
# columns. The grouping list is deliberately left unnamed so that aggregate()
# labels the key columns Group.1 ... Group.6.
aggregate(
  x = dat[c("CallsHandled", "Engaged")],
  by = unname(as.list(
    dat[c("Month", "ID", "Location", "LANGUAGE", "MemRegion", "queues")]
  )),
  FUN = sum
)

#   Group.1 Group.2 Group.3 Group.4 Group.5 Group.6 CallsHandled Engaged
#1        1   A1234  Corona English    NCAL  QueueA            1     200
#2        1   F1234  Denver English    NCAL  QueueA            1      30
#3        1   A1234  Corona Spanish    NCAL  QueueA            3      55
#4        1   F1234  Corona Spanish    NCAL  QueueA            1      75
#5        1   A1234  Corona English    SCAL  QueueA            1      45
#6        1   F1234  Corona Spanish    SCAL  QueueA            1      45
#7        1   F1234  Corona English    NCAL  QueueB            9    1363
#8        1   F1234  Denver English    NCAL  QueueB            1     120
#9        1   A1234  Corona English    SCAL  QueueB           10    1538
#10       1   A1234  Denver English    SCAL  QueueB            1      30
#11       1   F1234  Denver English    SCAL  QueueB            2      55
#12       1   A1234  Corona Spanish    SCAL  QueueB            1      75
#13       1   A1234  Corona English    NCAL  QueueC           10    2065
#14       1   F1234  Corona English    NCAL  QueueC            1     200
#15       1   F1234  Corona Spanish    NCAL  QueueC            1     230
#16       1   A1234  Corona Spanish    SCAL  QueueC            7    1288
#17       1   F1234  Corona Spanish    SCAL  QueueC            1      30

答案 2 :(得分:0)

@chappers解决方案正确聚合,但由于某些我无法弄清楚的原因,我留下了一堆重复的行。这适用于因子并减少实际数据帧中的行数(无重复):

# Formula interface of aggregate(): the cbind() on the left-hand side names
# the columns to sum, the right-hand side lists the grouping variables.
# The result has one row per combination of grouping values present in `df`.
# The cbind() call must be closed before the `~`; without that parenthesis the
# statement does not parse. na.rm = TRUE is forwarded to sum().
aggregate(
  cbind(CallsHandled, Engaged) ~ Month + ID + Location + LANGUAGE + MemRegion,
  data = df,
  FUN = sum,
  na.rm = TRUE
)