比方说我有一个包含 a 列的数据框,我想针对 i 的多个取值分别创建 a^i 列。
> dat <- data.frame(a=1:5)
> dat
a
1 1
2 2
3 3
4 4
5 5
例如,我想要i=2:5
的输出:
a power_2 power_3 power_4 power_5
1 1 1 1 1 1
2 2 4 8 16 32
3 3 9 27 81 243
4 4 16 64 256 1024
5 5 25 125 625 3125
目前我使用data.table
获取此输出,如下所示:
# Exponents to apply to column a.
exponents <- 2:5
# Convert the data frame to a data.table so columns can be added with `:=`.
DT <- data.table(dat)
# Add one column per exponent, named power_<exponent>; `:=` updates DT in place.
DT[, paste0("power_", exponents) := lapply(exponents, function(pow) a^pow)]
如何使用 plyr / dplyr 实现同样的效果?当然,我可以像下面这样为每个 i 逐个键入 power_i=a^i,但这不是我想要的。
# Illustrative only: `...` stands for the remaining power_i = a^i terms that
# would each have to be typed out by hand.
mutate(dat, power_2=a^2, power_3=a^3, ...)
已经有人提出了几个答案,@docendo discimus 已对它们做了比较。我只是补充了与 data.table 的比较。
library(data.table)
library(dplyr)
# Fix the RNG seed so the sampled test data is reproducible.
set.seed(2015)
# Test data: a single column `a` filled from sample(1000).
dat <- data.frame(a = sample(1000))
# Exponents to benchmark.
i <- 2:5
# Target column names: the existing names of dat followed by power_<exponent>.
n <- c(names(dat), paste0("power_", i))
# data.table version of dat, used by the data.table entry below.
DT <- data.table(dat)
library(microbenchmark)
# Evaluate each named expression 30 times; unit = "relative" requests relative
# timings (see the "Unit: relative" header of the printed result).
microbenchmark(
# NOTE(review): `:=` updates DT in place, so repeated evaluations presumably
# overwrite the power_* columns created by the first one - confirm this is intended.
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
# do() + outer(): build all power columns as one matrix, then rename.
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
# do() + sapply(): build one power column per exponent, then rename.
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
# bind_cols() + lapply(): bind a data frame of power columns, then rename.
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq mean median uq max neval cld
data.table 1.022945 1.039674 1.108558 1.026319 1.083644 2.370180 30 a
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
dd.do 1.149195 1.160735 1.167672 1.158141 1.150280 1.268279 30 a
dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711 30 b
使用两个 base 解决方案 Henrik2 和 josh(来自他的评论)更新了基准测试,这两个方案是最快的:
# Fix the RNG seed and build the test data: one column `a` from sample(1000).
set.seed(2015)
dat <- data.frame(a = sample(1000))
# NOTE(review): i, n and DT are not assigned in this snippet; they must already
# exist in the session, and DT is not rebuilt from the dat created above.
# Evaluate each named expression 30 times and report relative timings.
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
# Base R: the exponents are named before outer() is called, then the result
# is attached with cbind(); no setNames(n) step is used here.
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
# Base R: named list of power columns passed to data.frame(); the exponents
# are hard-coded as 2:5 instead of reusing i.
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
答案 0 :(得分:7)
一种可能性是在 do 中使用 outer,然后使用 setNames:
# Exponents to apply to column a.
i <- 2:5
# Inside do(), `.` is the data frame: append the matrix of powers built by
# outer(), then rename every column in a single setNames() call.
setNames(
  do(dat, data.frame(., outer(.$a, i, `^`))),
  c("a", paste0("power_", i))
)
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
如果先给"幂向量" i 的元素命名,就可以直接调用 cbind,而不必使用 do 和 data.frame;我认为在这种情况下不需要 dplyr 的函数。
# Name the exponents first so the matrix returned by outer() already carries
# the power_<exponent> column names, then attach it to dat.
cbind(
  dat,
  outer(dat[["a"]], setNames(i, paste0("power_", i)), FUN = `^`)
)
# a power_2 power_3 power_4 power_5
# 1 1 1 1 1 1
# 2 2 4 8 16 32
# 3 3 9 27 81 243
# 4 4 16 64 256 1024
# 5 5 25 125 625 3125
对于较大的样本数据,不使用 do 的 base 代码更快。我还加入了 @Josh O'Brien 的 base 解决方案。
# Reproducible test data: one column `a` filled from sample(1000).
set.seed(2015)
dat <- data.frame(a = sample(1000))
# NOTE(review): n and DT are not assigned anywhere in this snippet; they must
# already exist in the session for the entries below to run.
# Each named expression is evaluated 30 times; timings are reported relative
# to one another (unit = "relative").
microbenchmark(
data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
# Base R variant: outer() on pre-named exponents, attached with cbind().
Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)), `^`)),
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
# Base R variant: data.frame() with a named list of power columns; uses the
# literal 2:5 rather than i.
josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
times = 30,
unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
# data.table 1.991613 2.029778 1.982169 1.990417 1.946677 1.694030 30 bc
# Henrik 2.026345 2.017179 1.996419 2.003189 2.030176 1.733583 30 bc
# Henrik2 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 30 a
# dd.do 2.356886 2.375713 2.322452 2.348053 2.304826 2.101494 30 c
# dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116 30 d
# josh 1.725750 1.699887 1.641290 1.625331 1.637823 1.330598 30 b
答案 1 :(得分:5)
以下是使用do
的选项:
# Exponents, and the final column names (existing names plus power_<exponent>).
i <- 2:5
n <- c(names(dat), paste0("power_", i))
# Inside do(), `.` is the data frame: sapply() builds one power column per
# exponent, data.frame() appends them, and setNames() applies the names in n.
setNames(
  do(dat, data.frame(., sapply(i, function(x) .$a^x))),
  n
)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
另一种选择,使用bind_cols
:
# Build the power columns as a data frame, bind them to dat column-wise, and
# then apply the column names held in n.
setNames(
  bind_cols(dat, as.data.frame(lapply(i, function(x) dat$a^x))),
  n
)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125
评论后编辑:
@Henrik的解决方案比我的快:
# Fix the RNG seed so the sampled test data is reproducible.
set.seed(2015)
# Test data: a single column `a` filled from sample(1000).
dat <- data.frame(a = sample(1000))
# Exponents, and the target column names (existing names plus power_<exponent>).
i <- 2:5
n <- c(names(dat), paste0("power_", i))
library(microbenchmark)
# Evaluate each of the three dplyr-based variants 30 times and report timings
# relative to one another (unit = "relative").
microbenchmark(
# do() + outer(): all power columns built as one matrix.
Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
# do() + sapply(): one power column per exponent.
dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
# bind_cols() + lapply(): power columns bound as a separate data frame.
dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
times = 30,
unit = "relative"
)
Unit: relative
expr min lq median uq max neval
Henrik 1.000000 1.000000 1.000000 1.000000 1.000000 30
dd.do 1.138506 1.179104 1.173298 1.149581 2.660237 30
dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538 30
答案 2 :(得分:1)
可能这也有帮助
# Names of the power columns: power_2 ... power_5.
nm1 <- paste0("power_", 2:5)
# Named lookup list mapping each power column name to its exponent.
lst <- setNames(as.list(2:5), nm1)
# Starting table: one copy of the base values 1:5 for column "a" and for each
# power column. The column count is derived from the names so that it always
# matches c("a", nm1); a hard-coded replicate(4, ...) yields only four columns
# for five names and makes setNames() fail.
dat1 <- setNames(
  as.data.frame(replicate(length(nm1) + 1L, 1:5)),
  c("a", nm1)
)
# Transform only the columns named in nm1. Judging by the output below, `.`
# stands for the current column and `lst$.` for that column's exponent in lst
# (presumably via funs() substituting the column name for `.` - TODO confirm).
# NOTE(review): mutate_each_() and funs() come from an old dplyr API; confirm
# the installed dplyr version still provides them.
mutate_each_(dat1, funs(.^lst$.), nm1)
# a power_2 power_3 power_4 power_5
#1 1 1 1 1 1
#2 2 4 8 16 32
#3 3 9 27 81 243
#4 4 16 64 256 1024
#5 5 25 125 625 3125