使用 dplyr/tidyr 将多列摘要统计量同时展开(spread)为宽格式

时间:2019-08-26 11:39:10

标签: r dplyr

我想把第一列 Innovation 与统计列 min、max、mean、sd 一起展开(spread)成宽格式,即每个 Innovation 取值与每个统计量的组合各占一列。

这是我开发的代码:

# Build a summary table with one row per (Innovation_DelTradMerge, Variable)
# pair and one column per statistic (max, mean, min, sd).
plot1 <- BANKRUPTCY2 %>%
  select(
    Innovation, DelTradMerge, Depth, Scope, Novelty, PatW,
    Assets, Age, Sales, Income, NasdaqIndex
  ) %>%
  group_by(Innovation, DelTradMerge) %>%
  # "_" is used further down as the <variable>_<stat> separator, so remove it
  # from the column names before the summary columns are generated.
  rename_all(~ gsub("_", ".", .)) %>%
  # A named list of lambdas yields columns named <variable>_<stat>;
  # this replaces the deprecated summarise_each(funs(...)).
  summarise_all(list(
    min = ~ min(., na.rm = TRUE),
    max = ~ max(., na.rm = TRUE),
    mean = ~ mean(., na.rm = TRUE),
    sd = ~ sd(., na.rm = TRUE)
  )) %>%
  mutate_if(is.numeric, ~ round(., 1)) %>%
  # summarise() only peels off the last grouping level; drop the rest so the
  # reshaping steps below operate on a plain (ungrouped) tibble.
  ungroup() %>%
  gather(stat, val, -Innovation, -DelTradMerge) %>%
  separate(stat, into = c("Variable", "stat"), sep = "_") %>%
  spread(stat, val) %>%
  unite(Innovation, Innovation, DelTradMerge, sep = "_")

# A tibble: 54 x 6
   Innovation   Variable      max  mean   min    sd
   <chr>        <chr>       <dbl> <dbl> <dbl> <dbl>
 1 Innovators_1 Age           3.5   0.2   0     0.7
 2 Innovators_1 Assets       10.7   4.8   1     1.3
 3 Innovators_1 Depth         3.5   0.4   0     0.6
 4 Innovators_1 Income        7.1   1.3  -6.1   1.3

当我使用 spread(Innovation, -Variable) 时,只有 min 列按 Innovation 的取值被展开,而 max、mean、sd 仍保留为普通列,如下所示:

> plot1
# A tibble: 41 x 10
   Variable   max  mean    sd Innovators_1 Innovators_2 Innovators_3 NonInnovators_1 NonInnovators_2 NonInnovators_3
   <chr>    <dbl> <dbl> <dbl>        <dbl>        <dbl>        <dbl>           <dbl>           <dbl>           <dbl>
 1 Age        2.6   0.2   0.5           NA           NA         NA               0                NA            NA  
 2 Age        2.8   0.1   0.4           NA            0         NA              NA                 0            NA  
 3 Age        3.3   0.2   0.8           NA           NA          0              NA                NA            NA  
 4 Age        3.3   0.3   0.6           NA           NA         NA              NA                NA             0  
 5 Age        3.5   0.2   0.7            0           NA         NA              NA                NA            NA  

我也尝试了gather(stat, Innovation, -Variable),但是它不起作用。

如何把其他统计列 max、mean、sd 也同时展开?

这是我的数据:

# Example data for the question, written as a dput()-style literal.
# It rebuilds a tibble (classes tbl_df / tbl / data.frame) with 54 rows and
# six columns: Innovation, Variable, max, mean, min, sd.
# NOTE: the expression is not assigned to any name here; assign its value
# to a variable before using it.
structure(list(Innovation = c("Innovators_1", "Innovators_1", 
"Innovators_1", "Innovators_1", "Innovators_1", "Innovators_1", 
"Innovators_1", "Innovators_1", "Innovators_1", "Innovators_2", 
"Innovators_2", "Innovators_2", "Innovators_2", "Innovators_2", 
"Innovators_2", "Innovators_2", "Innovators_2", "Innovators_2", 
"Innovators_3", "Innovators_3", "Innovators_3", "Innovators_3", 
"Innovators_3", "Innovators_3", "Innovators_3", "Innovators_3", 
"Innovators_3", "NonInnovators_1", "NonInnovators_1", "NonInnovators_1", 
"NonInnovators_1", "NonInnovators_1", "NonInnovators_1", "NonInnovators_1", 
"NonInnovators_1", "NonInnovators_1", "NonInnovators_2", "NonInnovators_2", 
"NonInnovators_2", "NonInnovators_2", "NonInnovators_2", "NonInnovators_2", 
"NonInnovators_2", "NonInnovators_2", "NonInnovators_2", "NonInnovators_3", 
"NonInnovators_3", "NonInnovators_3", "NonInnovators_3", "NonInnovators_3", 
"NonInnovators_3", "NonInnovators_3", "NonInnovators_3", "NonInnovators_3"
), Variable = c("Age", "Assets", "Depth", "Income", "NasdaqIndex", 
"Novelty", "PatW", "Sales", "Scope", "Age", "Assets", "Depth", 
"Income", "NasdaqIndex", "Novelty", "PatW", "Sales", "Scope", 
"Age", "Assets", "Depth", "Income", "NasdaqIndex", "Novelty", 
"PatW", "Sales", "Scope", "Age", "Assets", "Depth", "Income", 
"NasdaqIndex", "Novelty", "PatW", "Sales", "Scope", "Age", "Assets", 
"Depth", "Income", "NasdaqIndex", "Novelty", "PatW", "Sales", 
"Scope", "Age", "Assets", "Depth", "Income", "NasdaqIndex", "Novelty", 
"PatW", "Sales", "Scope"), max = c(3.5, 10.7, 3.5, 7.1, 8, 0.7, 
7.3, 9, 0.7, 2.8, 10.8, 1.5, 7.1, 8, 0.7, 5.6, 10.4, 0.7, 3.3, 
10.1, 3, 6.5, 8, 0.7, 6.2, 8.5, 0.7, 2.6, 8.9, 0, 4.4, 8, 0, 
0, 8.9, 0, 2.8, 7.9, 0, 3.4, 7.6, 0, 0, 7.7, 0, 3.3, 9.7, 0, 
6.3, 8, 0, 0, 9, 0), mean = c(0.2, 4.8, 0.4, 1.3, 6.5, 0.2, 1.5, 
3.5, 0.5, 0.1, 4.2, 0.2, 0.9, 6.4, 0.3, 1.4, 3.1, 0.6, 0.2, 5.1, 
0.6, 1.7, 6.6, 0.2, 1.9, 3.8, 0.5, 0.2, 3.8, 0, 0.6, 6.4, 0, 
0, 2.6, 0, 0.1, 2.8, 0, -0.1, 6.3, 0, 0, 1.8, 0, 0.3, 4, 0, 0.8, 
6.5, 0, 0, 2.9, 0), min = c(0, 1, 0, -6.1, 5.7, 0, 0, 0, 0, 0, 
0.5, 0, -4.7, 5.7, 0, 0, 0, 0.1, 0, 0.8, 0, -6.4, 5.7, 0, 0, 
0.1, 0, 0, 0.6, 0, -6.3, 5.7, 0, 0, 0, 0, 0, 0, 0, -6.3, 5.5, 
0, 0, 0, 0, 0, 0.6, 0, -2.8, 5.7, 0, 0, 0, 0), sd = c(0.7, 1.3, 
0.6, 1.3, 0.5, 0.2, 1.2, 1.4, 0.2, 0.4, 2.2, 0.3, 1.8, 0.5, 0.2, 
1.3, 2.2, 0.2, 0.8, 1.5, 0.6, 1.7, 0.5, 0.2, 1.3, 1.5, 0.2, 0.5, 
1.2, 0, 1, 0.5, 0, 0, 1.2, 0, 0.4, 1.4, 0, 1.1, 0.5, 0, 0, 1.4, 
0, 0.6, 1.6, 0, 1.3, 0.5, 0, 0, 1.6, 0)), .Names = c("Innovation", 
"Variable", "max", "mean", "min", "sd"), class = c("tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -54L))

1 个答案:

答案 0 :(得分:3)

我找到了这个链接:https://community.rstudio.com/t/spread-with-multiple-value-columns/5378/2 ,那里有人用自己编写的函数回答了一个类似的问题。

#' Spread several value columns across one key column at once
#'
#' Stacks the selected value columns into long form, glues each key level to
#' the name of the value column it came from, and spreads the combined label
#' back out, giving one output column per `<key level>_<value column>` pair.
#'
#' @param df A data frame.
#' @param key Unquoted name of the column whose levels become column prefixes.
#' @param value Unquoted selection of the value columns, e.g. `c(max, mean)`.
#' @return `df` in wide form, with the key and value columns replaced by the
#'   combined `<key level>_<value column>` columns.
myspread <- function(df, key, value) {
  # Capture both arguments unevaluated so they can be forwarded to tidyr.
  key_quo <- rlang::enquo(key)
  value_quo <- rlang::enquo(value)

  # Long form: one row per original row and selected value column.
  stacked <- gather(df, variable, value, !!value_quo)
  # Combined label "<key level>_<value column>" in a temporary column.
  labelled <- unite(stacked, temp, !!key_quo, variable)
  # Wide form: one column per combined label.
  spread(labelled, temp, value)
}

# Spread max, mean, min and sd across the levels of Innovation in one call.
df2 <- myspread(df, Innovation, c(max, mean, min, sd))

得到的结果如下:

> df2
# A tibble: 27 x 10
   DelTradMerge Variable Innovators_max Innovators_mean Innovators_min Innovators_sd NonInnovators_m… NonInnovators_m…
          <dbl> <chr>             <dbl>           <dbl>          <dbl>         <dbl>            <dbl>            <dbl>
 1            1 Age                 3.5             0.2            0             0.7              2.6              0.2
 2            1 Assets             10.7             4.8            1             1.3              8.9              3.8
 3            1 Depth               3.5             0.4            0             0.6              0                0  
 4            1 Income              7.1             1.3           -6.1           1.3              4.4              0.6
 5            1 NasdaqI…            8               6.5            5.7           0.5              8                6.4
 6            1 Novelty             0.7             0.2            0             0.2              0                0  
 7            1 PatW                7.3             1.5            0             1.2              0                0  
 8            1 Sales               9               3.5            0             1.4              8.9              2.6
 9            1 Scope               0.7             0.5            0             0.2              0                0  
10            2 Age                 2.8             0.1            0             0.4              2.8              0.1
# … with 17 more rows, and 2 more variables: NonInnovators_min <dbl>, NonInnovators_sd <dbl>