使用(d)plyr创建一系列新列

时间:2015-02-08 16:54:20

标签: r data.table plyr dplyr

比方说我有一个包含a列的数据框,我想针对i的多个取值分别创建a^i列。

> dat <- data.frame(a=1:5)
> dat
    a
1   1
2   2
3   3
4   4
5   5

例如,我想要i=2:5的输出:

  a power_2 power_3 power_4 power_5
1 1       1       1       1       1
2 2       4       8      16      32
3 3       9      27      81     243
4 4      16      64     256    1024
5 5      25     125     625    3125

目前我使用data.table获取此输出,如下所示:

DT <- data.table(dat)
exponents <- 2:5
DT[, paste0("power_",exponents):=lapply(exponents, function(p) a^p)]

如何使用plyr / dplyr实现同样的效果?当然,我可以像下面这样为每个i手动键入power_i=a^i,但这不是我想要的。

mutate(dat, power_2=a^2, power_3=a^3, ...)

答案后的结论

已经提出了几个答案,并且已经通过@docendo discimus进行了比较。我只是添加了与data.table的比较。

library(data.table)
library(dplyr)
set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))
DT <-  data.table(dat)

library(microbenchmark)

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
)
Unit: relative
       expr       min        lq      mean    median        uq       max neval cld
 data.table  1.022945  1.039674  1.108558  1.026319  1.083644  2.370180    30  a 
     Henrik  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30  a 
      dd.do  1.149195  1.160735  1.167672  1.158141  1.150280  1.268279    30  a 
      dd.bc 14.350034 13.982658 13.737964 13.632361 13.606221 15.866711    30   b

加入两个base解决方案Henrik2和josh(来自他的评论)后更新的基准测试,这两个方案是最快的:

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b

3 个答案:

答案 0 :(得分:7)

一种可能性是在do中使用outer,然后使用setNames设置列名:
i <- 2:5
dat %>%
  do(data.frame(., outer(.$a, i, `^`))) %>%
  setNames(., c("a", paste0("power_", i)))

#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125

如果您先给“幂向量”i 的各元素命名,就可以直接调用cbind,而不必使用do和data.frame;我认为在这种情况下并不需要dplyr函数。

cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`))
#   a power_2 power_3 power_4 power_5
# 1 1       1       1       1       1
# 2 2       4       8      16      32
# 3 3       9      27      81     243
# 4 4      16      64     256    1024
# 5 5      25     125     625    3125

对于较大的样本数据,不使用do的base代码更快。我还添加了@Josh O'Brien的base解决方案。

set.seed(2015)
dat <- data.frame(a = sample(1000))

microbenchmark(
  data.table = DT[, paste0("power_",i):=lapply(i, function(k) a^k)],
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  Henrik2 = cbind(dat, outer(dat$a, setNames(i, paste0("power_", i)),  `^`)),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  josh = data.frame(dat, setNames(lapply(2:5, function(X) dat$a^X), paste0("power_", 2:5))),
  times = 30,
  unit = "relative"
)

# Unit: relative
#       expr       min        lq      mean    median        uq       max neval  cld
# data.table  1.991613  2.029778  1.982169  1.990417  1.946677  1.694030    30  bc 
#     Henrik  2.026345  2.017179  1.996419  2.003189  2.030176  1.733583    30  bc 
#    Henrik2  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000    30 a   
#      dd.do  2.356886  2.375713  2.322452  2.348053  2.304826  2.101494    30   c 
#      dd.bc 37.445491 36.081298 34.791638 34.783854 34.787655 27.832116    30    d
#       josh  1.725750  1.699887  1.641290  1.625331  1.637823  1.330598    30  b

答案 1 :(得分:5)

以下是使用do的选项:

i <- 2:5
n <- c(names(dat), paste0("power_", i))
dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125

另一种选择,使用bind_cols

dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n)
#  a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125

评论后编辑:

@Henrik的解决方案比我的快:

set.seed(2015)
dat <- data.frame(a = sample(1000))
i <- 2:5
n <- c(names(dat), paste0("power_", i))

library(microbenchmark)

microbenchmark(
  Henrik = dat %>% do(data.frame(., outer(.$a, i, `^`))) %>% setNames(n),
  dd.do = dat %>% do(data.frame(., sapply(i, function(x) .$a^x))) %>% setNames(n),
  dd.bc = dat %>% bind_cols(as.data.frame(lapply(i, function(x) .$a^x))) %>% setNames(n),
  times = 30,
  unit = "relative"
  )
Unit: relative
   expr       min        lq    median        uq       max neval
 Henrik  1.000000  1.000000  1.000000  1.000000  1.000000    30
  dd.do  1.138506  1.179104  1.173298  1.149581  2.660237    30
  dd.bc 18.862923 18.702178 18.058984 17.537727 16.426538    30

答案 2 :(得分:1)

可能这也有帮助

nm1 <- paste('power', 2:5, sep="_")
lst <- setNames(as.list(2:5), nm1)
dat1 <- setNames(as.data.frame(replicate(4, 1:5)),c('a', nm1) )
mutate_each_(dat1, funs(.^lst$.), nm1)
#    a power_2 power_3 power_4 power_5
#1 1       1       1       1       1
#2 2       4       8      16      32
#3 3       9      27      81     243
#4 4      16      64     256    1024
#5 5      25     125     625    3125