我有这样的数据集:
set.seed(1000)
df <- data.frame(exp = c("A", "A", "A", "A", "A", rep("B", 5)), di = rnorm(10, 20, 3), x = rnorm(10, 5, 0.2), y = rnorm(10, 5, 0.2))
实际上它是巨大的(> 10000行),但设计是相似的。我想计算数据集中每行的新值,该值是组中值的总和,大于行中的值。对于小数据集,这样的循环工作正常:
df$comp1 <- 0
for (i in 1:nrow(df)){
for (j in 1:nrow(df)){
dist = sqrt((df$x[j] - df$x[i])^2 + (df$y[j] - df$y[i])^2)
if (dist < 0.4 & df$exp[i] == df$exp[j] & df$di[j] > df$di[i]){
df$comp1[i] = df$comp1[i] + df$di[j]
}
}
}
结果应该是这样的
exp di x y comp1
1 A 18.66267 4.803514 5.534014 0.00000
2 A 16.38243 4.889102 4.754597 39.55850
3 A 20.12338 5.024276 5.166849 21.91817
4 A 21.91817 4.975826 5.106514 0.00000
5 A 17.64034 4.732792 4.870635 21.91817
6 B 18.84353 5.034011 5.120632 22.15925
7 B 18.57240 5.031016 4.643231 0.00000
8 B 22.15925 5.004986 5.066988 0.00000
9 B 19.94448 4.590683 5.112195 0.00000
10 B 15.88065 5.042631 5.244187 41.00278
但是,在整个数据集上运行它需要很长时间。任何想法如何加快它。 BR
答案 0 :(得分:2)
如果我正确满足您的要求,可以使用库data.table
的一个选项:
library(data.table)
setDT(df)
df[,comp:={ lv=(df$exp == exp); dx=x-df$x[lv] ;dy=y-df$y[lv]; lv2=((dx^2+dy^2) < 0.4^2 & di<df$di[lv]); sum(df$di[lv2 & lv])}, by=1:nrow(df)]
结果:
exp di x y comp
1: A 18.66267 4.803514 5.534014 0.00000
2: A 16.38243 4.889102 4.754597 39.55850
3: A 20.12338 5.024276 5.166849 21.91817
4: A 21.91817 4.975826 5.106514 0.00000
5: A 17.64034 4.732792 4.870635 21.91817
6: B 18.84353 5.034011 5.120632 22.15925
7: B 18.57240 5.031016 4.643231 0.00000
8: B 22.15925 5.004986 5.066988 0.00000
9: B 19.94448 4.590683 5.112195 0.00000
10: B 15.88065 5.042631 5.244187 41.00278
详细了解data.table调用的j
部分内所做的事情:
lv=(df$exp == exp); # Get a logical vector of the initial df to subset
dx=x-df$x[lv] # Do the differences between actual row x and all others by exp
dy=y-df$y[lv] # same for y
lv2=((dx^2+dy^2) < 0.4^2 & di<df$di[lv]) # make a logical vector where the distance is within a 0.4 radius, and the values above current value.
sum(df$di[lv2 & lv])]) # sum the di which match the union of 'exp' value with radius and absolute value computed above.
语法comp:={}
允许我们创建一个填充了内部函数结果的新列。
对这个小数据集进行基准测试:
使用的代码:
set.seed(1000)
entry <- data.frame(exp = c("A", "A", "A", "A", "A", rep("B", 5)), di = rnorm(10, 20, 3), x = rnorm(10, 5, 0.2), y = rnorm(10, 5, 0.2))
f.dt <-function(df) {
setDT(df)
df[,comp1:={lv=(df$exp == exp); dx=x-df$x[lv];dy=y-df$y[lv];lv2=((dx^2+dy^2)<0.4^2 & di<df$di[lv]);sum(df$di[lv2 & lv])},by=1:nrow(df)][]
}
f.ori <-function(df) {
df$comp1 <- 0
for (i in 1:nrow(df)){
for (j in 1:nrow(df)){
dist = sqrt((df$x[j] - df$x[i])^2 + (df$y[j] - df$y[i])^2)
if (dist < 0.4 & df$exp[i] == df$exp[j] & df$di[j] > df$di[i]){
df$comp1[i] = df$comp1[i] + df$di[j]
}
}
}
df
}
检查和基准:
> identical(f.ori(entry),f.dt(entry))
[1] TRUE
> library(microbenchmark)
> microbenchmark(f.ori(entry),f.dt(entry),times=10)
Unit: milliseconds
expr min lq mean median uq max neval cld
f.ori(entry) 16.597134 16.790708 24.687390 17.854078 18.036534 69.344878 10 b
f.dt(entry) 2.812088 3.013074 3.127194 3.126967 3.209214 3.492588 10 a
所以它在这个小数据集上的速度提高了大约6倍,在较大的数据集上,我认为速度的增加将呈指数级增长。