我编写了将函数应用于数据框input
的代码:
set.seed(1234)
n = 5000000
input <- as.matrix(data.frame(c1 = sample(1:10, n, replace = T), c2 = sample(1:10, n, replace = T), c3 = sample(1:10, n, replace = T), c4 = sample(1:10, n, replace = T)))
system.time(
test <- input %>%
split(1:nrow(input)) %>%
map(~ func1(.x, 2, 2, "test_1")) %>%
do.call("rbind", .))
## Here is the function used:
func1 <- function(dataC, PR, DB, MT){
c1 <- as.vector(dataC[1])
c2 <- as.vector(dataC[2])
c3 <- as.vector(dataC[3])
c4 <- as.vector(dataC[4])
newc1 <- -999
newc2 <- -999
if(MT=="test_1"){
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 + 1) : (c2 + PR)))
V1 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 + 1) : (c2 + PR)))
V2 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 + 1) : (c2 + PR)))
V3 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - 1) : (c2 + 1)))
V4 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
V5 <- 0
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - 1) : (c2 + 1)))
V6 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - PR) : (c2 - 1)))
V7 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 - PR) : (c2 - 1)))
V8 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - PR) : (c2 - 1)))
V9 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
} else if(MT=="test_2"){
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 + 1) : (c2 + PR)))
V1 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 + 1) : (c2 + PR)))
V2 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 + 1) : (c2 + PR)))
V3 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - 1) : (c2 + 1)))
V4 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
V5 <- 0
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - 1) : (c2 + 1)))
V6 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - PR) : (c2 - 1)))
V7 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 - PR) : (c2 - 1)))
V8 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB
listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - PR) : (c2 - 1)))
V9 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB
}
tot <- sum(c(1/V1, 1/V2, 1/V3, 1/V4, 1/V6, 1/V7, 1/V8, 1/V9), na.rm = TRUE)
mat_V <- matrix(data = c((1/V1)/tot, (1/V2)/tot, (1/V3)/tot, (1/V4)/tot, V5,
(1/V6)/tot, (1/V7)/tot, (1/V8)/tot, (1/V9)/tot), nrow = 3, ncol = 3, byrow = TRUE)
while((newc1 == -999 && newc2 == -999) || (c3 == newc1 && c4 == newc2)){
if(c3 == newc1 && c4 == newc2){
mat_V[choiceC[1], choiceC[2]] <- NaN
## print(mat_V)
}
choiceC <- which(mat_V == max(mat_V, na.rm = TRUE), arr.ind = TRUE)
## print(choiceC)
## If there are several maximum values
if(nrow(choiceC) > 1){
choiceC <- choiceC[sample(1:nrow(choiceC), 1), ]
}
if(choiceC[1]==1 & choiceC[2]==1){
newC <- matrix(c(x = c1 - 1, y = c2 + 1), ncol = 2)
} else if(choiceC[1]==1 & choiceC[2]==2){
newC <- matrix(c(x = c1, y = c2 + 1), ncol = 2)
} else if(choiceC[1]==1 & choiceC[2]==3){
newC <- matrix(c(x = c1 + 1, y = c2 + 1), ncol = 2)
} else if(choiceC[1]==2 & choiceC[2]==1){
newC <- matrix(c(x = c1 - 1, y = c2), ncol = 2)
} else if(choiceC[1]==2 & choiceC[2]==3){
newC <- matrix(c(x = c1 + 1, y = c2), ncol = 2)
} else if(choiceC[1]==3 & choiceC[2]==1){
newC <- matrix(c(x = c1 - 1, y = c2 - 1), ncol = 2)
} else if(choiceC[1]==3 & choiceC[2]==2){
newC <- matrix(c(x = c1, y = c2 - 1), ncol = 2)
} else if(choiceC[1]==3 & choiceC[2]==3){
newC <- matrix(c(x = c1 + 1, y = c2 - 1), ncol = 2)
}
newc1 <- as.vector(newC[,1])
newc2 <- as.vector(newC[,2])
}
return(newC)
}
该代码适用于小型数据集,但是当数据框包含超过一百万行时,它的速度将非常慢。我认为函数中有很多重复的代码行(例如条件if else
)会降低速度。有没有办法一次完成函数中的所有计算?我非常感谢您提供任何建议。
答案 0 :(得分:10)
首先要经过一番艰苦的恋爱,但我强烈建议您介绍基础知识,您的代码集中了一些不良做法,并且花一些时间研究矢量化等会获得巨大的投资回报。...请考虑发布下次在https://codereview.stackexchange.com/questions/tagged/r上进行讨论,因为这是一个更合适的问题。
您的瓶颈不是嵌套的ifs ,而是对expand.grid
的不足使用。
您在代码中通过expand.grid
创建了数据帧,从而错误地调用了listC
(它们没有列出)。然后,此昂贵的data.frame仅用于其行数,这是通过dim(listC)[1]
获得的,该行将更惯用的类型为nrow(listC)
。
实际上,该值(dim(listC)[1]
只能是PR^2
或3*PR
,因此您可以先计算它们,然后再使用它们。
嵌套的ifs 可以替换为嵌套的switch语句,更具可读性,并且 通过仅测试一次就可以提高我们的效率。
它使我们看到您忘记了代码中的一个条件。请在下面查看改进的代码。
当它看起来更整洁时,我们发现实际上可以用newC <- c(c1 - 2 + choice[2], c2 + 2 - choice[1])
代替它。
其他观察结果
c2 <- as.vector(dataC[2])
可以替换为c2 <- dataC[[2]]
t(c(1,2))
可以代替matrix(c(x = 1, y = 2), ncol = 2)
来构建2列和1行的矩阵,但是如果最终要在其上使用as.vector
,请执行{{ 1}}首先修改后的代码
c(1,2)