Question

使用 dplyr::summarise 时，如何保留其中一个分组名称？或者，有没有更好的方法来保留其中一个组名？我目前解决这个问题的方式可能相当低效。

我有一个data.frame（df）：

dput(head(df, n = 20))
structure(list(file_src = c("CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", 
"CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", 
"CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", 
"CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", 
"CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", "CBG_EFD.xlsx", 
"CBG_EFD.xlsx"), AU = c("CBD", "CBD", "CBD", "CBD", "CBD", "CBD", 
"CBD", "CBD", "CBD", "CBD", "CBD", "CBD", "CBD", "CBD", "CBD", 
"CBD", "CBD", "CBD", "CBD", "CBD"), BU = c("OAO", "Constr", "Retail", 
"OAO", "Constr", "Retail", "OAO", "Constr", "Retail", "OAO", 
"Constr", "Retail", "OAO", "Constr", "Retail", "OAO", "Constr", 
"Retail", "OAO", "Constr"), CC = c("AUDIT", "AUDIT", "AUDIT", 
"AUDIT", "AUDIT", "AUDIT", "CORC", "CORC", "CORC", "CORC", "CORC", 
"CORC", "CORC", "CORC", "CORC", "CORC", "CORC", "CORC", "CORC", 
"CORC"), CA_LVL = c("AUDIT01", "AUDIT01", "AUDIT01", "AUDIT02", 
"AUDIT02", "AUDIT02", "CORC01", "CORC01", "CORC01", "CORC02", 
"CORC02", "CORC02", "CORC03", "CORC03", "CORC03", "CORC04", "CORC04", 
"CORC04", "CORC05", "CORC05"), Score = c(1, 1, 2, 1, 3, 3, 1, 
3, 2, 2, 4, 2, 2, 3, 1, 4, 2, 3, 3, 2)), .Names = c("file_src", 
"AU", "BU", "CC", "CA_LVL", "Score"), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

定义：AU 是一组五（5）个“组”。BU 是一组五十五（55）个单元，每个单元都属于五个 AU 中的一个，即二者是父子关系。Score（分数）是 0–4 的原始数值。Control_Category（即数据中的 CC 列）是一个有六个取值（字符串）的变量。

目前，我的代码被分解，以便脚本执行两个级别的分组和聚合分数以提供简单的平均值。我首先在AU级别进行分组，以获得特定组别（CC）的所有单元的简单平均值。最后，我有五个data.frames（cbg.au.stat.wide，cbd.au.stat.wide等）。这些dfs代表给定组中所有单元的给定类别的平均分数。

# Group1 assessment unit scores:
# keep only the CBG rows, average Score per (AU, CC), then reshape to one
# row per AU with one column per control category
cbg.au.stat.wide <- df %>%
  filter(AU == "CBG") %>%
  group_by(AU, CC) %>%
  summarise(avg = mean(Score, na.rm = TRUE)) %>%
  dcast(AU ~ CC, value.var = "avg") %>%
  print()  # show the wide table; print() passes it through to the assignment

产生：

cbg.au.stat.wide
   AU AUDIT     CORC GOV      PPS     TMSC    TRAIN
1 CBG     3 2.733333 2.2 2.666667 1.583333 2.666667

之后，使用 dplyr::bind_rows 将所有“AU 级”数据框组合起来：

# Stack the five per-AU wide tables into a single data frame, one row per AU
au.avg.scores <- bind_rows(
  bsa.au.stat.wide,
  bsg.au.stat.wide,
  cbd.au.stat.wide,
  cbg.au.stat.wide,
  wmg.au.stat.wide
)

au.avg.scores
         AU    AUDIT     CORC      GOV      PPS     TMSC    TRAIN
1 BSA Admin 2.833333 2.000000 2.733333 2.000000 1.750000 2.333333
2       BSG 2.833333 0.000000 2.733333 2.000000 1.750000 2.333333
3       CBD 1.833333 2.533333 2.466667 2.000000 2.500000 2.166667
4       CBG 3.000000 2.733333 2.200000 2.666667 1.583333 2.666667
5       WMG 2.625000 1.816667 2.533333 2.166667 1.895833 2.375000

然后我执行类似的分组和总结活动。只有这一次而不是在AU级别（父级），我在BU级别为每个类别（CC）执行此操作。那么，对于给定的AU，我知道他们的控制类别中有一个BU平均分数表。

# Group1 business units by Control Category:
# keep only the CBG rows, average Score per (BU, CC), then reshape to one
# row per BU with one column per control category
cbg.bu.stat.wide <- df %>%
  filter(AU == "CBG") %>%
  group_by(BU, CC) %>%
  summarise(avg = mean(Score, na.rm = TRUE)) %>%
  dcast(BU ~ CC, value.var = "avg") %>%
  print()  # show the wide table; print() passes it through to the assignment

产生：

                      BU AUDIT CORC GOV PPS TMSC TRAIN
1        Capital Markets     3  3.2 1.6   4 1.00     3
2                    EFD     4  2.6 1.6   3 1.75     3
3 Global Trade Solutions     3  2.4 3.4   1 2.00     2
4         Investigations     1   NA  NA  NA   NA    NA

我想你可以看到，在“BU”级别的结果中，“AU”级别（列）已经被丢弃了。最后，我想将所有这些BU组合成一个大表，显示每个BU及其所属的AU

所以它最终看起来像这样：

> bu.avg.scores
AU BU AUDIT CORC GOV PPS TMSC TRAIN
CBG Adherence   3.0  1.4 3.2   1 1.50   3.0
CBG CTR   2.0  2.8 2.0   4 1.50   2.5
CBG HRCU   3.5  1.8 3.0   1 2.25   1.5
CBD Investigations   2.0   NA  NA  NA   NA    NA
BSG ACH   2.0  0.0 2.0   4 1.50   2.5

Answer 1

更新答案

这是基于评论主题的更新答案。我们分别按AU和BU进行汇总，并将结果存储在列表中。然后，我们将展示如何将摘要组合到单个数据框中，并将摘要输出为表格。

library(tidyverse)

# Mean Score per control category, computed once per AU and (separately)
# once per BU; both long-format summaries are kept in one named list
dfs <- list(
  AU = df %>%
    group_by(AU, CC) %>%
    summarise(avg = mean(Score, na.rm = TRUE)),
  BU = df %>%
    group_by(BU, CC) %>%
    summarise(avg = mean(Score, na.rm = TRUE))
)

现在，每个摘要都存储在单独的列表元素中。这使得两个不同级别的摘要保持分离，但存储在一个对象中，因此很容易进一步处理。

dfs

$AU
     AU    CC      avg
1   CBD AUDIT 1.833333
2   CBD  CORC 2.428571

$BU
      BU    CC   avg
1 Constr AUDIT   2.0
2 Constr  CORC   2.8
3    OAO AUDIT   1.0
4    OAO  CORC   2.4
5 Retail AUDIT   2.5
6 Retail  CORC   2.0

如果您需要单个数据框，则可以执行以下操作：

# Combine into a single table and spread.
# bind_rows() stacks the AU and BU summaries and records which list element
# each row came from in `Unit Level`; AU rows have BU = NA and vice versa.
df.table <- bind_rows(dfs, .id = "Unit Level") %>%
  # Drop the grouping inherited from the summaries so the identifier
  # columns can be modified and united freely
  ungroup() %>%
  # Blank out the missing identifier only in AU and BU, so that unite() does
  # not paste the text "NA" into the name. Replacing NA across the whole
  # frame would also hit `avg` and turn a missing/NaN mean into "",
  # breaking the numeric column.
  mutate(AU = coalesce(AU, ""), BU = coalesce(BU, "")) %>%
  unite(Unit, AU, BU, sep = "") %>%
  spread(CC, avg)

df.table

  `Unit Level`   Unit    AUDIT     CORC
1           AU    CBD 1.833333 2.428571
2           BU Constr 2.000000 2.800000
3           BU    OAO 1.000000 2.400000
4           BU Retail 2.500000 2.000000

如果您要在rmarkdown中创建报告，则可以将其转换为输出表格。这是我们删除重复行标识符的示例：

```{r}
# Show each `Unit Level` label only on its first row, then render the table
df.table %>%
  mutate(`Unit Level` = replace(`Unit Level`, duplicated(`Unit Level`), "")) %>%
  knitr::kable()
```

这是在PDF文件中输出时表格的样子：

或者，如果您想添加一个中间行以分隔AU和BU平均值，您可以这样做：

```{r, results="asis"}
library(xtable)
options(xtable.include.rownames = FALSE, xtable.comment = FALSE)

# Horizontal rules: above the header (-1), below the header (0), and after
# the last row of each unit level (cumulative row counts per `Unit Level`)
print(
  xtable(
    mutate(df.table,
           `Unit Level` = replace(`Unit Level`, duplicated(`Unit Level`), ""))
  ),
  hline.after = c(-1, 0, cumsum(table(df.table["Unit Level"])))
)
```

原始答案

在下面的代码中，我们首先计算AU-BU级别（每个AU内的每个BU）的平均值。然后我们计算AU级别的平均值，并使用bind_rows来合并两个级别的平均值。最后用spread将结果数据框转换为宽格式。

library(tidyverse)

# Mean Score for every (AU, BU, CC) combination, in long format
dfs <- df %>%
  group_by(AU, BU, CC) %>%
  summarise(avg = mean(Score, na.rm = TRUE))

dfs

     AU     BU    CC   avg
1   CBD Constr AUDIT   2.0
2   CBD Constr  CORC   2.8
3   CBD    OAO AUDIT   1.0
4   CBD    OAO  CORC   2.4
5   CBD Retail AUDIT   2.5
6   CBD Retail  CORC   2.0

# Append the AU-level averages below the AU-BU rows; the AU-level rows get a
# synthetic BU label ("All <AU> BU") so they fit the same columns
dfs <- df %>%
  group_by(AU, CC) %>%
  summarise(avg = mean(Score, na.rm = TRUE)) %>%
  mutate(BU = paste("All", AU, "BU")) %>%
  bind_rows(dfs, .)

dfs

     AU         BU    CC      avg
1   CBD     Constr AUDIT 2.000000
2   CBD     Constr  CORC 2.800000
3   CBD        OAO AUDIT 1.000000
4   CBD        OAO  CORC 2.400000
5   CBD     Retail AUDIT 2.500000
6   CBD     Retail  CORC 2.000000
7   CBD All CBD BU AUDIT 1.833333
8   CBD All CBD BU  CORC 2.428571

# Reshape to wide format: one column per CC holding its avg
# (tidyr's spread() plays the same role here as reshape2's dcast())
spread(dfs, CC, avg)

     AU         BU    AUDIT     CORC
1   CBD All CBD BU 1.833333 2.428571
2   CBD     Constr 2.000000 2.800000
3   CBD        OAO 1.000000 2.400000
4   CBD     Retail 2.500000 2.000000

这可以组合成一个链：

# AU-BU level averages first, AU-level averages (labelled "All <AU> BU")
# appended below, then reshaped to one column per control category
dfs <- bind_rows(
  df %>%
    group_by(AU, BU, CC) %>%
    summarise(avg = mean(Score, na.rm = TRUE)),
  df %>%
    group_by(AU, CC) %>%
    summarise(avg = mean(Score, na.rm = TRUE)) %>%
    mutate(BU = paste("All", AU, "BU"))
) %>%
  spread(CC, avg)

绑定来自不同数据帧的行时如何维护原始行名？

1 个答案:

更新答案

原始答案