tidyr; %>%group_by()mutate(foo = fill())

时间:2017-10-25 14:04:29

标签: r tidyr tidyverse mutate

我正在努力创建一个新变量,以指明id,LET中的哪些字母grp,某些群组id开头。

以下我将说明我的问题。我有这样的数据,

library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
             grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
             LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#>       id   grp   LET
#>    <int> <dbl> <chr>
#>  1     0     0     A
#>  2     0     0     B
#>  3     0     0     B
#>  4     0     1     B
#>  5     0     1     B
#>  6     0     1     A
#>  7     0     1     A
#>  8     1     0     A
#>  9     1     0     B
#> 10     1     1     B
#> 11     1     1     B
#> 12     1     1     A
#> 13     1     1     A
#> 14     1     1     A
#> 15     1     2     A
#> 16     1     2     B
#> 17     1     2     B

我现在想要创建一个新变量%>% group_by(id, grp),我想我可以用fill()mutate(grp_LET = …填充它,就像这样;

df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)

但我无法弄明白。我希望获得的是这样的。我期望的结果,

dfd <- tibble(id = rep(0:1, c(7, 10)),
             grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
             LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
             grp_LET =  rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#>       id   grp   LET grp_LET
#>    <int> <dbl> <chr>   <chr>
#>  1     0     0     A       A
#>  2     0     0     B       A
#>  3     0     0     B       A
#>  4     0     1     B       B
#>  5     0     1     B       B
#>  6     0     1     A       B
#>  7     0     1     A       B
#>  8     1     0     A       A
#>  9     1     0     B       A
#> 10     1     1     B       B
#> 11     1     1     B       B
#> 12     1     1     A       B
#> 13     1     1     A       B
#> 14     1     1     A       B
#> 15     1     2     A       A
#> 16     1     2     B       A
#> 17     1     2     B       A

对此有任何帮助将不胜感激。

更新2017-10-26 11:03:40Z,我做了 microbenchmark 比较三个答案你的享受,

tbl <- df
dim(tbl)
#> [1] 17  3

# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)

R_base_by_lmo <- function(x) {dat <- x
             dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
             FUN=function(x) head(x, 1)); as_tibble(dat) 
          }
# mapply(all.equal, R_base_by_lmo(tbl), dfd)

# install.packages(c("data.table"), dependencies = TRUE)
library(data.table) 
dt_by_akrun  <- function(x) {foo <- copy(x)
                  setDT(foo)[, grp_LET := LET[1], .(id, grp)]
                   as_tibble(foo)
               }
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)


# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)

x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
                      dt_by_akrun(x),
                      tidyverse_by_Psidom(x), times = 67)

## Print results:
print(res)
Unit: milliseconds                                                              
                   expr      min       lq     mean   median       uq       max neval cld
       R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043  4.098088    67   a 
         dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270    67   a 
 tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186    67   b

### Plot results:
boxplot(res)

com_plot

更新2017-10-26 12:10:55Z,鼓励akrun’s comment我在生产数据上重新进行微基准测试。哪个超过尊重结果。

dim('my production-data')
#> [1] 46104    11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
                      dt_by_akrun(x),
                      tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
                   expr         min          lq        mean      median          uq         max neval cld
       R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785     8   b
         dt_by_akrun(x)    74.18023    76.69274    85.75983    87.15791    91.62508   100.94692     8   a 
 tidyverse_by_Psidom(x)    38.38051    41.15552    42.83667    41.92207    44.53830    49.08109     8   a 

箱线图(RES2) pro_data

3 个答案:

答案 0 :(得分:4)

在基数R中,只需使用avehead

dat$grp_let <- ave(dat$LET, dat[c("id", "grp")], FUN=function(x) head(x, 1))

返回

dat
   id grp LET grp_let
1   0   0   A       A
2   0   0   B       A
3   0   0   B       A
4   0   1   B       B
5   0   1   B       B
6   0   1   A       B
7   0   1   A       B
8   1   0   A       A
9   1   0   B       A
10  1   1   B       B
11  1   1   B       B
12  1   1   A       B
13  1   1   A       B
14  1   1   A       B
15  1   2   A       A
16  1   2   B       A
17  1   2   B       A

答案 1 :(得分:3)

似乎每个组都需要第一个 LET;您可以从每个组的vector LET 中提取first元素,mutate将在组内广播/循环值:

df %>% group_by(id, grp) %>% mutate(grp_LET = first(LET))

# A tibble: 17 x 4
# Groups:   id, grp [5]
#      id   grp   LET grp_LET
#   <int> <dbl> <chr>   <chr>
# 1     0     0     A       A
# 2     0     0     B       A
# 3     0     0     B       A
# 4     0     1     B       B
# 5     0     1     B       B
# 6     0     1     A       B
# 7     0     1     A       B
# 8     1     0     A       A
# 9     1     0     B       A
#10     1     1     B       B
#11     1     1     B       B
#12     1     1     A       B
#13     1     1     A       B
#14     1     1     A       B
#15     1     2     A       A
#16     1     2     B       A
#17     1     2     B       A

答案 2 :(得分:3)

或者我们可以使用library(data.table) setDT(df)[, grp_LET := LET[1], .(id, grp)] df # id grp LET grp_LET # 1: 0 0 A A # 2: 0 0 B A # 3: 0 0 B A # 4: 0 1 B B # 5: 0 1 B B # 6: 0 1 A B # 7: 0 1 A B # 8: 1 0 A A # 9: 1 0 B A #10: 1 1 B B #11: 1 1 B B #12: 1 1 A B #13: 1 1 A B #14: 1 1 A B #15: 1 2 A A #16: 1 2 B A #17: 1 2 B A

{{1}}