Question

我编写了将函数应用于数据框input的代码：

    set.seed(1234) 
    n = 5000000
    input <- as.matrix(data.frame(c1 = sample(1:10, n, replace = T), c2 = sample(1:10, n, replace = T), c3 = sample(1:10, n, replace = T), c4 = sample(1:10, n, replace = T)))

    system.time(
    test <- input %>% 
      split(1:nrow(input)) %>% 
      map(~ func1(.x, 2, 2, "test_1")) %>% 
      do.call("rbind", .))

## Here is the function used:

    func1 <- function(dataC, PR, DB, MT){

          c1 <- as.vector(dataC[1])
          c2 <- as.vector(dataC[2])

          c3 <- as.vector(dataC[3])
          c4 <- as.vector(dataC[4])

          newc1 <- -999
          newc2 <- -999

          if(MT=="test_1"){

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 + 1) : (c2 + PR)))
            V1 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 + 1) : (c2 + PR)))
            V2 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 + 1) : (c2 + PR)))
            V3 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - 1) : (c2 + 1)))
            V4 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            V5 <- 0

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - 1) : (c2 + 1)))
            V6 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - PR) : (c2 - 1)))
            V7 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 - PR) : (c2 - 1)))
            V8 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - PR) : (c2 - 1)))
            V9 <- mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB


          } else if(MT=="test_2"){

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 + 1) : (c2 + PR)))
            V1 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 + 1) : (c2 + PR)))
            V2 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 + 1) : (c2 + PR)))
            V3 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - 1) : (c2 + 1)))
            V4 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            V5 <- 0

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - 1) : (c2 + 1)))
            V6 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 - PR) : (c1 - 1)), y = c((c2 - PR) : (c2 - 1)))
            V7 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

            listC <- expand.grid(x = c((c1 - 1) : (c1 + 1)), y = c((c2 - PR) : (c2 - 1)))
            V8 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * DB

            listC <- expand.grid(x = c((c1 + 1) : (c1 + PR)), y = c((c2 - PR) : (c2 - 1)))
            V9 <- harmonic.mean(sample(1:10, size = dim(listC)[1], replace = TRUE)) * sqrt(2) * DB

          }

          tot <- sum(c(1/V1, 1/V2, 1/V3, 1/V4, 1/V6, 1/V7, 1/V8, 1/V9), na.rm = TRUE)
          mat_V <- matrix(data = c((1/V1)/tot, (1/V2)/tot, (1/V3)/tot, (1/V4)/tot, V5, 
                                        (1/V6)/tot, (1/V7)/tot, (1/V8)/tot, (1/V9)/tot), nrow = 3, ncol = 3, byrow = TRUE)

          while((newc1 == -999 && newc2 == -999) || (c3 == newc1 && c4 == newc2)){

            if(c3 == newc1 && c4 == newc2){
              mat_V[choiceC[1], choiceC[2]] <- NaN
              ## print(mat_V)
            }

            choiceC <- which(mat_V == max(mat_V, na.rm = TRUE), arr.ind = TRUE)
            ## print(choiceC)
            ## If there are several maximum values
            if(nrow(choiceC) > 1){
              choiceC <- choiceC[sample(1:nrow(choiceC), 1), ]
            }

            if(choiceC[1]==1 & choiceC[2]==1){

              newC <- matrix(c(x = c1 - 1, y = c2 + 1), ncol = 2)

            } else if(choiceC[1]==1 & choiceC[2]==2){

              newC <- matrix(c(x = c1, y = c2 + 1), ncol = 2)

            } else if(choiceC[1]==1 & choiceC[2]==3){

              newC <- matrix(c(x = c1 + 1, y = c2 + 1), ncol = 2)

            } else if(choiceC[1]==2 & choiceC[2]==1){

              newC <- matrix(c(x = c1 - 1, y = c2), ncol = 2)

            } else if(choiceC[1]==2 & choiceC[2]==3){

              newC <- matrix(c(x = c1 + 1, y = c2), ncol = 2)

            } else if(choiceC[1]==3 & choiceC[2]==1){

              newC <- matrix(c(x = c1 - 1, y = c2 - 1), ncol = 2)

            } else if(choiceC[1]==3 & choiceC[2]==2){

              newC <- matrix(c(x = c1, y = c2 - 1), ncol = 2)

            } else if(choiceC[1]==3 & choiceC[2]==3){ 

              newC <- matrix(c(x = c1 + 1, y = c2 - 1), ncol = 2)
            }

            newc1 <- as.vector(newC[,1])
            newc2 <- as.vector(newC[,2])

          }

          return(newC)

        }

该代码适用于小型数据集，但是当数据框包含超过一百万行时，它的速度将非常慢。我认为函数中有很多重复的代码行（例如条件if else）会降低速度。有没有办法一次完成函数中的所有计算？我非常感谢您提供任何建议。

Answer 1

首先要经过一番艰苦的恋爱，但我强烈建议您介绍基础知识，您的代码集中了一些不良做法，并且花一些时间研究矢量化等会获得巨大的投资回报。...请考虑发布下次在https://codereview.stackexchange.com/questions/tagged/r上进行讨论，因为这是一个更合适的问题。

您的瓶颈不是嵌套的ifs ，而是对expand.grid的不足使用。

您在代码中通过expand.grid创建了数据帧，从而错误地调用了listC（它们没有列出）。然后，此昂贵的data.frame仅用于其行数，这是通过dim(listC)[1]获得的，该行将更惯用的类型为nrow(listC)。

实际上，该值（dim(listC)[1]只能是PR^2或3*PR，因此您可以先计算它们，然后再使用它们。

嵌套的ifs 可以替换为嵌套的switch语句，更具可读性，并且通过仅测试一次就可以提高我们的效率。

它使我们看到您忘记了代码中的一个条件。请在下面查看改进的代码。

当它看起来更整洁时，我们发现实际上可以用newC <- c(c1 - 2 + choice[2], c2 + 2 - choice[1])代替它。

其他观察结果

注释您的代码，而不是对我们，对您（然后在决定发布问题时对我们）注释。
c2 <- as.vector(dataC[2])可以替换为c2 <- dataC[[2]]
t(c(1,2))可以代替matrix(c(x = 1, y = 2), ncol = 2)来构建2列和1行的矩阵，但是如果最终要在其上使用as.vector，请执行{{ 1}}首先
代码可能会进一步优化

修改后的代码

c(1,2)

在R中一次运行多个条件

1 个答案: