我正在努力创建一个新变量,以指明id,LET
中的哪些字母grp
,某些群组id
开头。
以下我将说明我的问题。我有这样的数据,
library(dplyr); library(tidyr)
df <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)))
#> # A tibble: 17 x 3
#> id grp LET
#> <int> <dbl> <chr>
#> 1 0 0 A
#> 2 0 0 B
#> 3 0 0 B
#> 4 0 1 B
#> 5 0 1 B
#> 6 0 1 A
#> 7 0 1 A
#> 8 1 0 A
#> 9 1 0 B
#> 10 1 1 B
#> 11 1 1 B
#> 12 1 1 A
#> 13 1 1 A
#> 14 1 1 A
#> 15 1 2 A
#> 16 1 2 B
#> 17 1 2 B
我现在想要创建一个新变量%>% group_by(id, grp)
,我想我可以用fill()
和mutate(grp_LET = …
填充它,就像这样;
df %>% group_by(id, grp) %>% fill(LET) %>% mutate(grp_LET = factor)
但我无法弄明白。我希望获得的是这样的。我期望的结果,
dfd <- tibble(id = rep(0:1, c(7, 10)),
grp = rep(c(0,1,0,1,2), c(3,4,2,5,3)),
LET = rep(c('A', 'B', 'A', 'B', 'A', 'B'), c(1,4, 3, 3, 4, 2)),
grp_LET = rep(c('A', 'B', 'A', 'B', 'A'), c(3, 4, 2, 5, 3)));dfd
#> # A tibble: 17 x 4
#> id grp LET grp_LET
#> <int> <dbl> <chr> <chr>
#> 1 0 0 A A
#> 2 0 0 B A
#> 3 0 0 B A
#> 4 0 1 B B
#> 5 0 1 B B
#> 6 0 1 A B
#> 7 0 1 A B
#> 8 1 0 A A
#> 9 1 0 B A
#> 10 1 1 B B
#> 11 1 1 B B
#> 12 1 1 A B
#> 13 1 1 A B
#> 14 1 1 A B
#> 15 1 2 A A
#> 16 1 2 B A
#> 17 1 2 B A
对此有任何帮助将不胜感激。
tbl <- df
dim(tbl)
#> [1] 17 3
# install.packages(c("dplyr"), dependencies = TRUE)
library(dplyr)
R_base_by_lmo <- function(x) {dat <- x
dat$grp_LET <- ave(dat$LET, dat[c("id", "grp")],
FUN=function(x) head(x, 1)); as_tibble(dat)
}
# mapply(all.equal, R_base_by_lmo(tbl), dfd)
# install.packages(c("data.table"), dependencies = TRUE)
library(data.table)
dt_by_akrun <- function(x) {foo <- copy(x)
setDT(foo)[, grp_LET := LET[1], .(id, grp)]
as_tibble(foo)
}
# mapply(all.equal, dt_by_akrun(tbl), dfd)
tidyverse_by_Psidom <- function(x) x %>% group_by(id,grp) %>% mutate(grp_LET=first(LET))
# mapply(all.equal, tidyverse_by_Psidom(df), dfd)
# install.packages(c("microbenchmark"), dependencies = TRUE)
require(microbenchmark)
x <- tbl
res <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 67)
## Print results:
print(res)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 1.338758 1.419860 1.620292 1.547867 1.640043 4.098088 67 a
dt_by_akrun(x) 1.670019 1.776765 2.123219 1.859477 1.972842 11.922270 67 a
tidyverse_by_Psidom(x) 3.964432 4.065466 4.718041 4.128942 4.478950 15.939186 67 b
### Plot results:
boxplot(res)
dim('my production-data')
#> [1] 46104 11
x <- 'my production-data'
res2 <- microbenchmark(R_base_by_lmo(x),
dt_by_akrun(x),
tidyverse_by_Psidom(x), times = 8)
print(res2)
Unit: milliseconds
expr min lq mean median uq max neval cld
R_base_by_lmo(x) 28976.46868 29236.19450 29468.63955 29464.51339 29591.25206 30188.72785 8 b
dt_by_akrun(x) 74.18023 76.69274 85.75983 87.15791 91.62508 100.94692 8 a
tidyverse_by_Psidom(x) 38.38051 41.15552 42.83667 41.92207 44.53830 49.08109 8 a
答案 0 :(得分:4)
在基数R中,只需使用ave
和head
:
dat$grp_let <- ave(dat$LET, dat[c("id", "grp")], FUN=function(x) head(x, 1))
返回
dat
id grp LET grp_let
1 0 0 A A
2 0 0 B A
3 0 0 B A
4 0 1 B B
5 0 1 B B
6 0 1 A B
7 0 1 A B
8 1 0 A A
9 1 0 B A
10 1 1 B B
11 1 1 B B
12 1 1 A B
13 1 1 A B
14 1 1 A B
15 1 2 A A
16 1 2 B A
17 1 2 B A
答案 1 :(得分:3)
似乎每个组都需要第一个 LET;您可以从每个组的vector LET 中提取first
元素,mutate
将在组内广播/循环值:
df %>% group_by(id, grp) %>% mutate(grp_LET = first(LET))
# A tibble: 17 x 4
# Groups: id, grp [5]
# id grp LET grp_LET
# <int> <dbl> <chr> <chr>
# 1 0 0 A A
# 2 0 0 B A
# 3 0 0 B A
# 4 0 1 B B
# 5 0 1 B B
# 6 0 1 A B
# 7 0 1 A B
# 8 1 0 A A
# 9 1 0 B A
#10 1 1 B B
#11 1 1 B B
#12 1 1 A B
#13 1 1 A B
#14 1 1 A B
#15 1 2 A A
#16 1 2 B A
#17 1 2 B A
答案 2 :(得分:3)
或者我们可以使用library(data.table)
setDT(df)[, grp_LET := LET[1], .(id, grp)]
df
# id grp LET grp_LET
# 1: 0 0 A A
# 2: 0 0 B A
# 3: 0 0 B A
# 4: 0 1 B B
# 5: 0 1 B B
# 6: 0 1 A B
# 7: 0 1 A B
# 8: 1 0 A A
# 9: 1 0 B A
#10: 1 1 B B
#11: 1 1 B B
#12: 1 1 A B
#13: 1 1 A B
#14: 1 1 A B
#15: 1 2 A A
#16: 1 2 B A
#17: 1 2 B A
{{1}}