为每个级别的因子附加一行总和

时间:2017-09-08 17:49:42

标签: r dplyr data.table data-manipulation

我想为每个Reg添加一行总和,如此

   Reg   Res    Pop
1      Total 1000915
2    A Urban 500414
3    A Rural 500501

4     Total  999938
5    B Urban 499922
6    B Rural 500016

7      Total 1000912
8    C Urban 501638
9    C Rural 499274

10     Total  999629
11    D Urban 499804
12    D Rural 499825

13     Total 1000303
14   E Urban 499917
15   E Rural 500386

MWE如下:

# Reproducible example: five regions (A-E), each with an Urban and a Rural row.
set.seed(12345)  # fix the RNG so the simulated populations are repeatable
Pop <- rpois(n = 10, lambda = 500000)
Res <- rep(c("Urban", "Rural"), times = 5)
Reg <- rep(LETTERS[1:5], each = 2)
df <- data.frame(Reg = Reg, Res = Res, Pop = Pop)

df
   Reg   Res    Pop
1    A Urban 500414
2    A Rural 500501
3    B Urban 499922
4    B Rural 500016
5    C Urban 501638
6    C Rural 499274
7    D Urban 499804
8    D Rural 499825
9    E Urban 499917
10   E Rural 500386

# dplyr provides %>%, group_by() and summarise(); it must be attached first.
library(dplyr)

# Population total per region (one row per Reg).
df %>%
   group_by(Reg) %>%
   summarise(Total = sum(Pop))
# A tibble: 5 x 2
     Reg   Total
  <fctr>   <int>
1      A 1000915
2      B  999938
3      C 1000912
4      D  999629
5      E 1000303

被修改

我希望同时得到 dplyr 和 data.table 两种解决方案。

6 个答案:

答案 0 :(得分:3)

# Split df by region, then put a "Total" row on top of each region's rows.
# The result is a named list holding one data.frame per region.
lapply(split(df, df$Reg), function(region_rows) {
  total_row <- data.frame(
    Reg = region_rows$Reg[1],
    Res = "Total",
    Pop = sum(region_rows$Pop)
  )
  rbind(total_row, region_rows)
})
$A
  Reg   Res     Pop
1   A Total 1000915
2   A Urban  500414
3   A Rural  500501

$B
  Reg   Res    Pop
1   B Total 999938
3   B Urban 499922
4   B Rural 500016

$C
  Reg   Res     Pop
1   C Total 1000912
5   C Urban  501638
6   C Rural  499274

$D
  Reg   Res    Pop
1   D Total 999629
7   D Urban 499804
8   D Rural 499825

$E
   Reg   Res     Pop
1    E Total 1000303
9    E Urban  499917
10   E Rural  500386

如果需要,可以使用 do.call(rbind, ...) 将整个结果转换为一个 data.frame。

答案 1 :(得分:3)

您可以在汇总结果中添加一个额外的 Res 列,然后用 bind_rows 将其与原始数据框合并:

# Build one "Total" row per region, stack it on top of the original rows,
# then sort by region so the rows of each region end up together.
bind_rows(
  summarise(group_by(df, Reg), Pop = sum(Pop), Res = 'Total'),
  df
) %>%
  arrange(Reg)

# A tibble: 15 x 3
#     Reg     Pop   Res
#   <chr>   <int> <chr>
# 1     A 1000915 Total
# 2     A  500414 Urban
# 3     A  500501 Rural
# 4     B  999938 Total
# 5     B  499922 Urban
# 6     B  500016 Rural
# 7     C 1000912 Total
# 8     C  501638 Urban
# 9     C  499274 Rural
#10     D  999629 Total
#11     D  499804 Urban
#12     D  499825 Rural
#13     E 1000303 Total
#14     E  499917 Urban
#15     E  500386 Rural

相应的data.table解决方案:

# as.data.table() copies df; setDT(df) would convert df itself by reference
# and leave dt and df pointing at the same object.
dt <- as.data.table(df)

# One "Total" row per region stacked on the detail rows, then ordered by Reg
# so every total is followed by the rows of its own region.
rbindlist(
  list(dt[, .(Pop = sum(Pop), Res = 'Total'), by = Reg], dt),
  use.names = TRUE
)[order(Reg)]

答案 2 :(得分:1)

堆叠和重新排列将起作用:

library(dplyr)

# Example data; stringsAsFactors = FALSE keeps Reg and Res as character
# columns, so the "Total" label binds in as plain text.
Reg <- rep(LETTERS[1:5], each = 2)
Res <- rep(c("Urban", "Rural"), times = 5)
set.seed(12345)
Pop <- rpois(n = 10, lambda = 500000)
df <- data.frame(Reg, Res, Pop, stringsAsFactors = FALSE)


# One "Total" row per region.
sums <- df %>%
  group_by(Reg) %>%
  summarise(Pop = sum(Pop)) %>%
  mutate(Res = "Total")

# Stack detail and total rows, then sort by region with the "Total" row first:
# Res != "Total" is FALSE for totals, and FALSE sorts before TRUE. Sorting on
# Res alphabetically would place "Total" between "Rural" and "Urban".
df_sums <- bind_rows(df, sums) %>%
  arrange(Reg, Res != "Total")

答案 3 :(得分:1)

您的数据:

# Example data: five regions, each with an Urban and a Rural population.
Reg <- rep(LETTERS[1:5], each = 2)
Res <- rep(c("Urban", "Rural"), times = 5)
set.seed(12345)
Pop <- rpois(n = 10, lambda = 500000)
df  <- data.frame(Reg, Res, Pop)

# library() stops with an error when dplyr is missing; require() would only
# return FALSE and let the pipeline fail later with a less helpful message.
library(dplyr)

# Per-region population totals, kept in df1 for the binding step.
df1 <-
df %>%
  group_by(Reg) %>%
  summarise(Total = sum(Pop))

我的解决方案(注意:我把前面那段管道的结果保存到了 df1 中):

# Append one "Total" row per region (taken from df1) to the detail rows.
df <- rbind(df, data.frame(Reg = df1$Reg, Res = "Total", Pop = df1$Total))

# Sort by region in decreasing order, then flip the whole frame: the net
# effect is ascending regions with the row order inside each region reversed.
# TRUE is spelled out because T is an ordinary, reassignable variable.
df <- df[order(as.character(df$Reg), decreasing = TRUE), ]
# rev(seq_len()) also behaves for a zero-row frame, unlike seq(n, 1).
df <- df[rev(seq_len(nrow(df))), ]

结果:

# Print without row names (FALSE spelled out; F can be reassigned).
print(df, row.names = FALSE)
 Reg   Res     Pop
   A Total 1000915
   A Rural  500501
   A Urban  500414
   B Total  999938
   B Rural  500016
   B Urban  499922
   C Total 1000912
   C Rural  499274
   C Urban  501638
   D Total  999629
   D Rural  499825
   D Urban  499804
   E Total 1000303
   E Rural  500386
   E Urban  499917

如果要在组之间使用换行符打印它们,而不更改数据类型:

# Print each region as its own table, followed by a blank line.
for (g in unique(df$Reg)) {
  print(df[df$Reg == g, ], row.names = FALSE)
  cat("\n")
}
 Reg   Res     Pop
   A Total 1000915
   A Rural  500501
   A Urban  500414

 Reg   Res    Pop
   B Total 999938
   B Rural 500016
   B Urban 499922

 Reg   Res     Pop
   C Total 1000912
   C Rural  499274
   C Urban  501638

 Reg   Res    Pop
   D Total 999629
   D Rural 499825
   D Urban 499804

 Reg   Res     Pop
   E Total 1000303
   E Rural  500386
   E Urban  499917

您还要求提供 data.table 解决方案。做法与上面相同,只是 df1 改为按如下方式创建:

dt  <- as.data.table(df)
# Group by the Reg column itself and name the aggregate "Total", so that df1
# has the Reg and Total columns that the rbind() step reads via df1$Reg and
# df1$Total. With by = dt$Reg and an unnamed sum(Pop), those names are absent.
df1 <- dt[, .(Total = sum(Pop)), by = Reg]

答案 4 :(得分:1)

我们可以使用 dplyr 和 purrr。这与 d.b 的方法类似,但 map_dfr 的输出本身就是数据框,因此不需要再把列表转换成数据框。请注意,我使用 data_frame 函数构造 df,因为此分析不需要因子。df2 是最终输出。

library(dplyr)
library(purrr)

# tibble() keeps Reg and Res as character columns (no factors); it replaces
# the deprecated data_frame() constructor.
df <- tibble(Reg, Res, Pop)

# For each region, append a "Total" row below that region's rows; map_dfr()
# row-binds the per-region pieces back into a single tibble.
df2 <- df %>%
  split(.$Reg) %>%
  map_dfr(
    ~ bind_rows(
      .x,
      tibble(Reg = .x$Reg[1], Res = "Total", Pop = sum(.x$Pop))
    )
  )

df2 
# A tibble: 15 x 3
     Reg   Res     Pop
   <chr> <chr>   <int>
 1     A Urban  500414
 2     A Rural  500501
 3     A Total 1000915
 4     B Urban  499922
 5     B Rural  500016
 6     B Total  999938
 7     C Urban  501638
 8     C Rural  499274
 9     C Total 1000912
10     D Urban  499804
11     D Rural  499825
12     D Total  999629
13     E Urban  499917
14     E Rural  500386
15     E Total 1000303

答案 5 :(得分:1)

data.table 包的开发版本 1.10.5(安装说明见此处)新增了三个函数,用于计算不同分组层级上的聚合,可以用在这里。

请注意,OP的预期结果包含连续的行号1到15,这表示OP期望一个data.frame或data.table而不是Frank首选的列表。但是,我们将在下面显示,data.table也可以用眼睛友好的方式打印。

rollup()

使用新的 rollup() 函数,并按 Reg 排序:

# Needs the data.table development version 1.10.5 (as of 2015-09-10).
library(data.table)
setDT(df)  # convert df to a data.table in place

# Sum Pop over the (Reg, Res) hierarchy, then sort the result by Reg.
rollup(
  df,
  j = list(Pop = sum(Pop)),
  by = c("Reg", "Res")
)[order(Reg)]

我们得到了

    Reg   Res     Pop
 1:   A Urban  500414
 2:   A Rural  500501
 3:   A    NA 1000915
 4:   B Urban  499922
 5:   B Rural  500016
 6:   B    NA  999938
 7:   C Urban  501638
 8:   C Rural  499274
 9:   C    NA 1000912
10:   D Urban  499804
11:   D Rural  499825
12:   D    NA  999629
13:   E Urban  499917
14:   E Rural  500386
15:   E    NA 1000303
16:  NA    NA 5001697

各级合计行用 NA 表示(包括总计行)。如果想更好地重现预期结果,可以删除总计行,并将 NA 替换为 Total:

# Same rollup, sorted by Reg; then label the subtotal rows (NA in Res) as
# "Total" and drop the rows whose Reg is NA (the grand total).
rollup(
  df,
  j = list(Pop = sum(Pop)),
  by = c("Reg", "Res")
)[
  order(Reg)
][
  is.na(Res), Res := "Total"
][
  !is.na(Reg)
]
    Reg   Res     Pop
 1:   A Urban  500414
 2:   A Rural  500501
 3:   A Total 1000915
 4:   B Urban  499922
 5:   B Rural  500016
 6:   B Total  999938
 7:   C Urban  501638
 8:   C Rural  499274
 9:   C Total 1000912
10:   D Urban  499804
11:   D Rural  499825
12:   D Total  999629
13:   E Urban  499917
14:   E Rural  500386
15:   E Total 1000303

请注意,Total 行显示在明细行的下方,这与 OP 的预期结果并不完全一致。

groupingsets()

使用groupingsets()功能,可以非常详细地控制聚合:

# Aggregate only for the listed grouping sets: per Reg, and per (Reg, Res).
# After sorting by Reg, rows with NA in Res are relabelled "Total"; the
# trailing [] makes the result print after the := assignment.
groupingsets(
  df,
  j = list(Pop = sum(Pop)),
  by = c("Reg", "Res"),
  sets = list("Reg", c("Reg", "Res"))
)[
  order(Reg)
][
  is.na(Res), Res := "Total"
][]
    Reg   Res     Pop
 1:   A Total 1000915
 2:   A Urban  500414
 3:   A Rural  500501
 4:   B Total  999938
 5:   B Urban  499922
 6:   B Rural  500016
 7:   C Total 1000912
 8:   C Urban  501638
 9:   C Rural  499274
10:   D Total  999629
11:   D Urban  499804
12:   D Rural  499825
13:   E Total 1000303
14:   E Urban  499917
15:   E Rural  500386

现在,Total行显示在详细信息行上方,并且根本没有创建总计。

美观地打印"经典"的 data.table 解决方案

截至目前,Psidom 和 Hack-R 已经各自发布了一个"经典"的 data.table 解决方案。

两者都可以更简洁地重写为

# Put one "Total" row per region on top of the detail rows, then sort by Reg.
rbind(
  df[, .(Res = "Total", Pop = sum(Pop)), by = Reg],
  df
)[order(Reg)]

结果可以用"对眼睛友好"的方式打印,即在各组之间插入空行:

# Print one table per region with a blank line after each. j is evaluated
# only for its printing side effect, so the grouped call's own return value
# is wrapped in invisible() to keep it from being auto-printed after the
# last table.
invisible(
  rbind(df[, .(Res = "Total", Pop = sum(Pop)), by = Reg], df)[
    order(Reg),
    {
      print(data.table(Reg, .SD), row.names = FALSE)
      cat("\n")
    },
    by = Reg
  ]
)
 Reg   Res     Pop
   A Total 1000915
   A Urban  500414
   A Rural  500501

 Reg   Res    Pop
   B Total 999938
   B Urban 499922
   B Rural 500016

 Reg   Res     Pop
   C Total 1000912
   C Urban  501638
   C Rural  499274

 Reg   Res    Pop
   D Total 999629
   D Urban 499804
   D Rural 499825

 Reg   Res     Pop
   E Total 1000303
   E Urban  499917
   E Rural  500386