使用R中的dplyr为每个因子水平追加汇总行

时间:2017-09-09 02:28:19

标签: r dplyr data-manipulation

这是this question的后续问题。使用下面的代码,我无法得到所需的输出。

# Toy data set: 24 rows covering 5 regions (A-E) and 12 cities (two rows
# per city), with residence alternating Urban/Rural row by row.
Reg  <- rep(LETTERS[1:5], times = c(4, 6, 4, 4, 6))
City <- rep(letters[1:12], each = 2)
Res  <- rep_len(c("Urban", "Rural"), length(Reg))
# Seed once, then draw Pop before Pop1 so both columns are reproducible.
set.seed(12345)
Pop  <- rpois(length(Reg), lambda = 5e5)
Pop1 <- rpois(length(Reg), lambda = 500)
df   <- data.frame(Reg = Reg, City = City, Res = Res, Pop = Pop, Pop1 = Pop1)

# Attach dplyr explicitly: the pipeline below needs `%>%` and the dplyr verbs.
library(dplyr)

# Add one "Total" row (summed Pop and Pop1) on top of each region's detail rows.
df3 <-
  df %>%
  group_by(Reg) %>%
  summarise(Pop = sum(Pop), Pop1 = sum(Pop1), Res = "Total") %>%
  # The totals are bound first so they precede the detail rows of their region
  bind_rows(df) %>%
  # Sort by region; the second key keeps "Total" on top (FALSE sorts before TRUE)
  arrange(Reg, Res != "Total") %>%
  # No distinct() here: it would silently drop detail rows that happen to share
  # the same Reg/Res/Pop/Pop1 values, which would corrupt any later sums.
  select(Reg, Res, Pop, Pop1)
df3


# A tibble: 29 x 4
      Reg   Res     Pop  Pop1
   <fctr> <chr>   <int> <int>
 1      A Total 2000853  2135
 2      A Urban  500414   549
 3      A Rural  500501   545
 4      A Urban  499922   536
 5      A Rural  500016   505
 6      B Total 3000844  2938
 7      B Urban  501638   510
 8      B Rural  499274   492
 9      B Urban  499804   506
10      B Rural  499825   508
# ... with 19 more rows

然后,还需要用下面的代码来计算每个Reg中农村和城市各自的总和:
# Sum both population columns for every Reg/Res combination of df3.
# The summaries are left unnamed, so the columns are labelled by their expressions.
summarise(group_by(df3, Reg, Res), sum(Pop), sum(Pop1))

# A tibble: 15 x 4
# Groups:   Reg [?]
      Reg   Res `sum(Pop)` `sum(Pop1)`
   <fctr> <chr>      <int>       <int>
 1      A Rural    1000517        1050
 2      A Total    2000853        2135
 3      A Urban    1000336        1085
 4      B Rural    1499485        1446
 5      B Total    3000844        2938
 6      B Urban    1501359        1492
 7      C Rural     999234         987
 8      C Total    1997259        2007
 9      C Urban     998025        1020
10      D Rural     998760        1058
11      D Total    2000712        2052
12      D Urban    1001952         994
13      E Rural    1501848        1547
14      E Total    2999304        3050
15      E Urban    1497456        1503

此外,Res列中各行的顺序也与所需输出不同。

但是,我想以更有效和更紧凑的方式进行所有计算。

所需的输出

  Reg   Res     Pop  Pop1
 1      A Total 2000853  2135
 2      A Urban 1000336  1085
 3      A Rural 1000517  1050

 4      B Total 3000844  2938
 5      B Urban 1501359  1492
 6      B Rural 1499485  1446

2 个答案:

答案 0 :(得分:3)

不确定这样是否更紧凑,但只要把两次group_by的顺序颠倒过来,就可以省去好几行代码:

# Step 1: subtotals for every Reg/Res combination.
reg_res_sums <- summarise(
  group_by(df, Reg, Res),
  Pop = sum(Pop), Pop1 = sum(Pop1)
)
# Step 2: aggregate the subtotals once more to get one "Total" row per Reg.
reg_totals <- summarise(
  group_by(reg_res_sums, Reg),
  Pop = sum(Pop), Pop1 = sum(Pop1), Res = "Total"
)
# Step 3: the totals are passed to bind_rows() first, so after ordering by Reg
# each "Total" row sits above that region's Rural/Urban rows.
arrange(bind_rows(reg_totals, reg_res_sums), Reg)

# A tibble: 15 x 4
#      Reg     Pop  Pop1   Res
#    <chr>   <int> <int> <chr>
#  1     A 2000853  2135 Total
#  2     A 1000517  1050 Rural
#  3     A 1000336  1085 Urban
#  4     B 3000844  2938 Total
#  5     B 1499485  1446 Rural
#  6     B 1501359  1492 Urban
#  7     C 1997259  2007 Total
#  8     C  999234   987 Rural
#  9     C  998025  1020 Urban
# 10     D 2000712  2052 Total
# 11     D  998760  1058 Rural
# 12     D 1001952   994 Urban
# 13     E 2999304  3050 Total
# 14     E 1501848  1547 Rural
# 15     E 1497456  1503 Urban

答案 1 :(得分:0)

以下是使用tidyverse的一种方案:
library(tidyverse)
# Aggregate df once per grouping set (Reg x Res, then Reg alone) and stack
# the two results.
grouping_sets <- list(c("Reg", "Res"), "Reg")
sum_by <- function(cols) {
  df %>%
    group_by(!!!syms(cols)) %>%
    summarise_at(vars("Pop", "Pop1"), sum)
}
grouping_sets %>%
  map(sum_by) %>%
  bind_rows() %>%
  # Rows from the Reg-only aggregation have no Res value; label them "Total"
  mutate(Res = replace(as.character(Res), is.na(Res), "Total")) %>%
  # FALSE sorts before TRUE, so the "Total" row leads each Reg
  arrange(Reg, Res != "Total")
# A tibble: 15 x 4
# Groups:   Reg [5]
#      Reg   Res     Pop  Pop1
#   <fctr> <chr>   <int> <int>
# 1      A Total 2000853  2135
# 2      A Rural 1000517  1050
# 3      A Urban 1000336  1085
# 4      B Total 3000844  2938
# 5      B Rural 1499485  1446
# 6      B Urban 1501359  1492
# 7      C Total 1997259  2007
# 8      C Rural  999234   987
# 9      C Urban  998025  1020
#10      D Total 2000712  2052
#11      D Rural  998760  1058
#12      D Urban 1001952   994
#13      E Total 2999304  3050
#14      E Rural 1501848  1547
#15      E Urban 1497456  1503

或者使用data.table的类似方案:
library(data.table)
# `df` is a plain data.frame, so the `[i, j, by]` syntax below would fail with
# "unused argument (by = ...)". Convert it first; as.data.table() makes a copy,
# so `df` itself is left untouched.
dt <- as.data.table(df)
# One aggregation per grouping set: Reg x Res subtotals, then Reg totals.
grouping_sets <- list(c("Reg", "Res"), "Reg")
sums <- lapply(grouping_sets, function(grp) {
  dt[, lapply(.SD, sum), by = c(grp), .SDcols = c("Pop", "Pop1")]
})
# fill = TRUE pads the Res column that the Reg-only totals lack with NA; those
# rows are then labelled "Total" and moved to the top of each Reg
# (FALSE sorts before TRUE).
rbindlist(sums, fill = TRUE)[
  is.na(Res), Res := "Total"][order(Reg, Res != "Total")]
#    Reg   Res     Pop Pop1
# 1:   A Total 2000853 2135
# 2:   A Urban 1000336 1085
# 3:   A Rural 1000517 1050
# 4:   B Total 3000844 2938
# 5:   B Urban 1501359 1492
# 6:   B Rural 1499485 1446
# 7:   C Total 1997259 2007
# 8:   C Urban  998025 1020
# 9:   C Rural  999234  987
#10:   D Total 2000712 2052
#11:   D Urban 1001952  994
#12:   D Rural  998760 1058
#13:   E Total 2999304 3050
#14:   E Urban 1497456 1503
#15:   E Rural 1501848 1547