Question

下面的可复制示例可以用作测试用例。我正在寻找base-R解决方案，因为

我对C ++（集成Rcpp）或Java（集成rJava）没有经验
因此，“速度问题”可能是算法固有的
我喜欢R的报告工具和快速原型制作的优势。

背景（根据@Gregor的输入）：

最初的问题是骑行问题。因此，我们面临一个问题：“合并两个游览更好吗？”数十亿次。下面的示例具有相同的结构（两个地点的appart多远？下一个停下来的最佳地点是哪个？合并两个游览会产生什么结果？）。
我们了解出色的answer on how to speed up R-code，但在我们的案例中，我们还有一个附加的“业务逻辑”，它会在各种if-else语句中显示。因此，当出现if-else语句时，对我们来说似乎不可能实现矢量化代码-但可能我们错过了一些东西。

可复制示例

generate_random_sequence <- function(nrows=1000) {
  x <- c(0, runif(nrows, min = 0, max = 100), 0)
  y <- c(0, runif(nrows, min = 0, max = 100), 0)
  loc <- c("Start", floor(runif(nrows, 1000, 9999)), "Start")
  return(data.table::data.table(x = x, y = y, loc = loc, stringsAsFactors = FALSE))
}

dist <- function(x1, y1, x2, y2) {
  return(sqrt((x1-x2)^2 + (y1-y2)^2))
}

get_best_loc <- function(cur_loc, stop1, stop2) {
  d1 <- dist(cur_loc$x, cur_loc$y, stop1$x, stop1$y)
  d2 <- dist(cur_loc$x, cur_loc$y, stop2$x, stop2$y)
  if (d1 <= d2) {
    best_stop <- stop1
    stop_id <- 1L
  } else {
    best_stop <- stop2
    stop_id <- 2L
  }
  return(list(best_stop = best_stop, stop_id = stop_id))
}

combine_sequence <- function(t1, t2) {
  t_combined <- c(1, rep(NA, nrow(t1) + nrow(t2) - 4), 1)
  ind_max <- c(nrow(t1) - 1, nrow(t2) - 1)
  last_stop <- t1[1, ]
  ind_next_stop <- c(2, 2)
  ind_t_combined <- 1
  while (all(ind_next_stop <= ind_max)) {
    r <- get_best_loc(last_stop, 
                        t1[ind_next_stop[1],], t2[ind_next_stop[2],])
    best_stop <- r$best_stop; stop_id <- r$stop_id; rm(r)
    if (stop_id == 1) {
      ind_next_stop[1] <- ind_next_stop[1] + 1
    } else {
      ind_next_stop[2] <- ind_next_stop[2] + 1
    }
    ind_t_combined <- ind_t_combined + 1
    t_combined[ind_t_combined] <- stop_id
    last_stop <- best_stop
  }

  if (ind_next_stop[1] < ind_max[1]) {
    t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <- 
      t1$loc[ind_next_stop[1]:(nrow(t1) - 1)]
  } else {
    t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <- 
      t2$loc[ind_next_stop[2]:(nrow(t2) - 1)]
  }
  return(t_combined)
}

n <- 1e2
t1 <- generate_random_sequence(n)
t2 <- generate_random_sequence(n)
microbenchmark::microbenchmark(combine_sequence(t1, t2),
                               times = 10L, unit = "s")

评论输入：

使用分析时，得到以下输出。对我来说，目前尚不清楚，如果我有杠杆可以达到x1000的加速比。（我只是在黑暗中）

分析代码：

  profvis::profvis({
    t_combined <- c(1, rep(NA, nrow(t1) + nrow(t2) - 4), 1)
    ind_max <- c(nrow(t1) - 1, nrow(t2) - 1)
    last_stop <- t1[1, ]
    ind_next_stop <- c(2, 2)
    ind_t_combined <- 1
    while (all(ind_next_stop <= ind_max)) {
      r <- get_best_loc(last_stop, 
                        t1[ind_next_stop[1],], t2[ind_next_stop[2],])
      best_stop <- r$best_stop; stop_id <- r$stop_id; rm(r)
      if (stop_id == 1) {
        ind_next_stop[1] <- ind_next_stop[1] + 1
      } else {
        ind_next_stop[2] <- ind_next_stop[2] + 1
      }
      ind_t_combined <- ind_t_combined + 1
      t_combined[ind_t_combined] <- stop_id
      last_stop <- best_stop
    }

    if (ind_next_stop[1] < ind_max[1]) {
      t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <- 
        t1$loc[ind_next_stop[1]:(nrow(t1) - 1)]
    } else {
      t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <- 
        t2$loc[ind_next_stop[2]:(nrow(t2) - 1)]
    }
})

使用base-R是否可以将代码加速〜1000倍？

0 个答案: