我有一个返回多个值的函数。我需要在 ddply 中使用它,但我想避免多次调用该函数。这是一个模拟示例:
library(plyr)
# Return the smallest and the largest element of `i` as an unnamed
# length-2 vector (minimum first, maximum second).
ff <- function(i) {
  lowest <- min(i)
  highest <- max(i)
  c(lowest, highest)
}
# Build a small reproducible data set: ids 1..3 repeated four times (12 rows)
# with one random integer between 1 and 10 per row.
set.seed(12345)
id <- rep(1:3, 4)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
x <- sample(1:10, 12, replace = TRUE)
df <- data.frame(id, x)
# Per-id summary. val3/val4 each call ff(x), so ff runs twice per group --
# this double evaluation is what the question is trying to avoid.
res <- ddply(df, .(id), summarise,
             val1 = min(x), val2 = max(x),
             val3 = ff(x)[1], val4 = ff(x)[2])
View(res)
id val1 val2 val3 val4
1 1 4 10 4 10
2 2 1 9 1 9
3 3 2 8 2 8
正如所料,val3 = val1,val4 = val2。但是我必须在 ddply 中调用函数 ff 两次,这在时间上不是最佳的。有没有办法在 ddply 内只调用一次 ff,并把它的两个输出分别赋给 val3 和 val4?如果我尝试使用 [1:2] 或类似内容,则会收到错误消息:Error in eval(expr, envir, enclos) : length(rows) == 1 is not TRUE
谢谢!
编辑。感谢所有贡献者!David 的解决方案速度提高了约 2 倍,并且允许对中间结果做进一步的运算。下面是一份完全可重现的更新代码。
library(plyr)
library(data.table)
library(microbenchmark)
# Compute the minimum and maximum of `i` and return them together as an
# unnamed vector: element 1 is the minimum, element 2 is the maximum.
ff <- function(i) {
  c(min(i), max(i))
}
# Larger reproducible data set: ids 1..3 repeated 4000 times (12000 rows)
# with one uniform random value in [1, 10] per row.
set.seed(12345)
id <- rep(1:3, 4000)
x <- runif(12000, 1, 10)
# The original call was data.frame(id, id2, x), but `id2` is never defined,
# so the script stopped with "object 'id2' not found". Only id and x are
# used below, so the stray column is dropped.
df <- data.frame(id, x)
View(df)

# plyr version: ff(x) is evaluated twice per group (once for each element).
res <- ddply(df,.(id),summarise,val1 = min(x), val2 = max(x), val3 = ff(x)[1], val4 = ff(x)[2], val5 = val3+val4, val6 = val3/val4)
View(res)

# data.table version: ff(x) is evaluated once per group. Because ff returns an
# unnamed length-2 vector, c(val3 = ff(x)) names its elements val31 and val32.
# NOTE: setDT() converts df to a data.table by reference, so from this point
# on df is a data.table -- including inside the ddply benchmark below.
res2 <- setDT(df)[, as.list(c(val1 = min(x), val2 = max(x), val3 = ff(x))), .(id)][, val5 := val31+val32][, val6 := val31/val32]
View(res2)

# Time both approaches, 100 repetitions each.
print(microbenchmark(ddply(df,.(id),summarise,val1 = min(x), val2 = max(x), val3 = ff(x)[1], val4 = ff(x)[2], val5 = val3+val4, val6 = val3/val4), times = 100))
print(microbenchmark(setDT(df)[, as.list(c(val1 = min(x), val2 = max(x), val3 = ff(x))), .(id)][, val5 := val31+val32][, val6 := val31/val32],times=100))
结果:
Unit: milliseconds
expr
ddply(df, .(id), summarise, val1 = min(x), val2 = max(x), val3 = ff(x)[1], val4 = ff(x)[2], val5 = val3 + val4, val6 = val3/val4)
min lq mean median uq max neval
3.042616 3.185358 5.976851 3.409828 3.925104 45.5157 100
Unit: milliseconds
expr
setDT(df)[, as.list(c(val1 = min(x), val2 = max(x), val3 = ff(x))), .(id)][, `:=`(val5, val31 + val32)][, `:=`(val6, val31/val32)]
min lq mean median uq max neval
1.968349 2.071747 2.285368 2.124206 2.251171 12.62967 100
答案 0 :(得分:1)
如果你让函数返回一个命名向量,那么 data.table 会接受它,并用这些名称作为列名来填充各列,从而返回所需的结构:
require(data.table)
# Return the minimum and maximum of `i` as a named vector; the element
# names (val3, val4) are what end up as column names downstream.
ff <- function(i) {
  c(val3 = min(i), val4 = max(i))
}
# Grouped summary by id: ff(x) is called once per group and its named
# elements (val3, val4) become columns next to var1 and var2.
setDT(df)[, as.list(c(var1 = min(x), var2 = max(x), ff(x))), by = id]
#-----------
id var1 var2 val3 val4
1: 1 4 10 4 10
2: 2 1 9 1 9
3: 3 2 8 2 8