提取一系列子序列的最快方法

时间:2016-01-21 13:37:49

标签: r performance

是否有任何R特定函数,已经过优化,它执行以下操作:给定命名序列,创建一个包含每个步骤的数组,每个名称上的下一个元素?这是一个具有我自己功能的MWE:

getSS <- function(U){
  l <- length(U)
  n <- as.numeric(names(U))
  N <- max(n)
  SS <- matrix(nrow = N, ncol = l)
  start <- rep(1,N)
  for(i in 1:l){
    cur <- n[i]
    SS[cur, start[cur]:i] <- U[i]
    start[cur] = i+1
  }
  return(SS)
}

U <- rnorm(10); names(U) <- sample(1:3, 10, replace = TRUE)
getSS(U)

# > U
# 1          3          1          3          1          2          3          2          2          3 
# 1.9767055 -1.2574765  1.5140555  1.1170087  2.0574106 -1.7337048  1.8112673 -1.0141680 -0.3471426 -0.3978564 
# > getSS(U)
# [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]       [,8]       [,9]      [,10]
# [1,]  1.976706  1.514055  1.514055  2.057411  2.057411        NA        NA         NA         NA         NA
# [2,] -1.733705 -1.733705 -1.733705 -1.733705 -1.733705 -1.733705 -1.014168 -1.0141680 -0.3471426         NA
# [3,] -1.257476 -1.257476  1.117009  1.117009  1.811267  1.811267  1.811267 -0.3978564 -0.3978564 -0.3978564

欢迎任何有关特定家庭实施的建议!

编辑:反对罗兰的基准建议:

roland <- function(U){
  res <- vapply(sort(unique(names(U))),
                function(name, vec) {
                  vec[names(vec) != name] <- NA
                  names(vec) <- NULL
                  na.locf(vec, fromLast = TRUE, na.rm = FALSE)
                },
                vec = U,
                FUN.VALUE = numeric(length(U)))
  t(res)
}

# > microbenchmark::microbenchmark(roland(U), getSS(U))
# Unit: microseconds
# expr     min       lq     mean   median       uq     max neval
# roland(U) 240.024 263.5120 302.9377 272.1465 291.6175 950.012   100
# getSS(U)  42.420  47.9515  60.6633  55.0665  67.0210 159.675   100

2 个答案:

答案 0 :(得分:1)

这是一个Rcpp函数:

library(Rcpp)
library(inline)
cppFunction(
  'NumericMatrix getSScpp(const NumericVector x, const IntegerVector iv) {
            const int c = x.length();
            const int r = max(iv);
            NumericMatrix SS(r,c);
            double fill;
            for (int i = 0; i < r; ++i) {
              fill = NA_REAL;
              for (int j = c-1; j >= 0; --j) {
                if (iv(j) == (i+1)) fill = x(j);
                SS(i,j) = fill;
              }
            }
            return SS;
  }')
all.equal(getSS(U),
          getSScpp(U, as.integer(names(U))))
#[1] TRUE

library(microbenchmark)
microbenchmark(getSS(U),
               getSScpp(U, as.integer(names(U))))
#Unit: microseconds
#                             expr    min     lq     mean  median      uq     max neval
#                         getSS(U) 24.189 28.186 31.74323 29.2130 34.8215 107.162   100
#getSScpp(U, as.integer(names(U)))  3.035  3.524  4.18695  3.8235  4.1440  16.338   100

答案 1 :(得分:0)

您可以使用vapplyna.locf

library(zoo)

res <- vapply(sort(unique(names(U))),
              function(name, vec) {
                vec[names(vec) != name] <- NA
                names(vec) <- NULL
                na.locf(vec, fromLast = TRUE, na.rm = FALSE)
              },
              vec = U,
              FUN.VALUE = numeric(length(U)))
t(res)