R中组之间的平均差异

时间:2018-12-03 11:36:27

标签: r

对于示例数据框:

# Example data for the question: a 78-row tibble (classes tbl_df / tbl /
# data.frame) with four columns:
#   name   - character label, letters "a".."z" repeated three times
#   amount - double, the measured value whose group means are compared
#   decile - integer group id in 1..10
#   time   - integer year (2016, 2017 or 2018)
# The trailing `spec` attribute is a "col_spec" object listing one collector
# per column; presumably left over from reading the data with readr before it
# was dumped with dput() -- it is not used by the code below.
df1 <- structure(list(name = c("a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z", "a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z", "a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z"), amount = c(5.5, 5.4, 5.2, 5.3, 5.1, 
5.1, 5, 5, 4.9, 4.5, 6, 5.9, 5.7, 5.4, 5.3, 5.1, 5.6, 5.4, 5.3, 
5.6, 4.6, 4.2, 4.5, 4.2, 4, 3.8, 6, 5.8, 5.7, 5.6, 5.3, 5.6, 
5.4, 5.5, 5.4, 5.1, 9, 8.8, 8.6, 8.4, 8.2, 8, 7.8, 7.6, 7.4, 
7.2, 6, 5.75, 5.5, 5.25, 5, 4.75, 10, 8.9, 7.8, 6.7, 5.6, 4.5, 
3.4, 2.3, 1.2, 0.1, 6, 5.8, 5.7, 5.6, 5.5, 5.5, 5.4, 5.6, 5.8, 
5.1, 6, 5.5, 5.4, 5.3, 5.2, 5.1), decile = c(1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L), time = c(2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L)), .Names = c("name", "amount", 
"decile", "time"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-78L), spec = structure(list(cols = structure(list(name = structure(list(), class = c("collector_character", 
"collector")), amount = structure(list(), class = c("collector_double", 
"collector")), decile = structure(list(), class = c("collector_integer", 
"collector")), time = structure(list(), class = c("collector_integer", 
"collector"))), .Names = c("name", "amount", "decile", "time"
)), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"))

我想生成一个汇总表,按年份分别列出相邻十分位组之间的平均差异:十分位1与2(即十分位1组的平均值减去十分位2组的平均值)、2与3、3与4、4与5、5与6、6与7、7与8、8与9,以及9与10。

有人有什么建议吗?

2 个答案:

答案 0 :(得分:5)

使用 dplyr(注:原答案的代码在抓取时丢失;此处残留的 `df['a'] = df['a'].str.replace('[^a-zA-Z]+', '') print (df) a 0 apple 1 orangemg` 是一段无关的 pandas 代码,并非本题的解答)。

答案 1 :(得分:1)

您也可以这样做:

library(tidyverse)

# For each year (`time`), add one column per pair of adjacent deciles holding
#   mean(amount in decile i) - mean(amount in decile i + 1).
# The columns are named mean_1_2, mean_2_3, ..., mean_9_10 and the value is
# repeated on every row of the year. df1 is left grouped by `time`.
#
# Uses mutate() with tidy evaluation (`!!` and `:=`) instead of the deprecated
# mutate_(.dots = ...) interface, so no R code is assembled from strings.

# The grouping is the same for every iteration, so set it once up front.
df1 <- df1 %>%
  group_by(time)

for (i in 1:9) {
  diff_col <- paste0("mean_", i, "_", i + 1L)

  df1 <- df1 %>%
    mutate(
      # `!!` injects the loop values as constants so they can never be
      # mistaken for a column; a decile that is absent from a year yields
      # NaN (mean of an empty vector).
      !!diff_col := mean(amount[decile == !!i], na.rm = TRUE) -
        mean(amount[decile == !!(i + 1L)], na.rm = TRUE)
    )
}

输出为:

# A tibble: 78 x 13
# Groups:   time [3]
   name  amount decile  time mean_1_2 mean_2_3 mean_3_4 mean_4_5 mean_5_6 mean_6_7 mean_7_8 mean_8_9 mean_9_10
   <chr>  <dbl>  <int> <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
 1 a        5.5      1  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 2 b        5.4      2  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 3 c        5.2      3  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 4 d        5.3      4  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 5 e        5.1      5  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 6 f        5.1      6  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 7 g        5        7  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 8 h        5        8  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 9 i        4.9      9  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
10 j        4.5     10  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
# ... with 68 more rows

然后可以按如下方式得到每年一行的完整汇总:

# Reduce df1 to one row per year: drop the grouping, keep only `time` and the
# mean_* difference columns, then remove the duplicated rows.
df1 <- ungroup(df1)
df1 <- select(df1, time, starts_with("mean"))
df1 <- distinct(df1)

输出:

# A tibble: 3 x 10
   time mean_1_2 mean_2_3 mean_3_4 mean_4_5 mean_5_6 mean_6_7 mean_7_8 mean_8_9 mean_9_10
  <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
1  2016   0.1000   0.2       0.1      0.15    0.1000   -0.2     0.1000    0.1      0.0500
2  2017   0.263    0.0625    0.213    0.237   0.0875   -1.06    0.0500    0.150    0.25  
3  2018   0.600    0.433     0.433    0.433   0.4       0.633   0.45      0.450    0.9