在达到特定值时重置cumsum

时间:2016-05-31 17:27:32

标签: r performance grouping cumsum

我想在向量上重置cumsum,因为它达到了某个值。

E.g。对于以下向量:

v <- c(3, 5, 2, 5, 3, 4, 5, 3, 1, 4)

预期输出为:

c(0, 0, 10, 0, 0, 22, 0, 30, 0, 0)

使用reset <- 10我可以将任务减少到标记完整整数后的第一个值:

res <- cumsum(v)
resd <- res/reset
resd
# [1] 0.3 0.8 1.0 1.5 1.8 2.2 2.7 3.0 3.1 3.5

预期输出为:

c(F, F, T, F, F, T, F, T, F, F) # or 
c(0, 0, 1.0, 0, 0, 2.2, 0, 3.0, 0, 0)

我需要一种快速的方法来计算其中一种。

5 个答案:

答案 0 :(得分:7)

我的(改进的)解决方案:

v <- c(3, 5, 2, 5, 3, 4, 5, 3, 1, 4)
res <- cumsum(v)
reset <- 10
resd <- res/reset
res[diff(c(0, floor(resd))) == 0] <- 0
print(res) #gives 0  0 10  0  0 22  0 30  0  0

编辑:现在v中的第一个元素可能大于10。

答案 1 :(得分:3)

另一种可能的方法:

v <- c(3, 5, 2, 5, 3, 4, 5, 3, 1, 4)
reset <- 10
s <- cumsum(v)
idx <- as.integer(s / reset)
logic <- idx >= 1 & !duplicated(idx)

> logic
[1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE

# corresponding one-liner
logic <- with(list(idx=as.integer(cumsum(v) / reset)),idx >= 1 & !duplicated(idx))

为了好玩,我还创建了该功能的Rcpp版本:

library(Rcpp)
library(inline)

cumsumResetRcpp <- cxxfunction(signature(values='numeric',reset='integer'),
'
  Rcpp::IntegerVector r(reset);
  int resetVal = r[0];
  Rcpp::NumericVector v(values);
  int n = v.size();
  Rcpp::NumericVector result(n);
  double cumsum = 0;
  for(int i = 0; i < n; i++){
    int prevCumSumFloor = (int)(cumsum / resetVal);
    cumsum += v[i];
    int currCumSumFloor = (int)(cumsum / resetVal);
    if(currCumSumFloor > prevCumSumFloor)
      result[i] = cumsum;
  }
  return( result ) ;
', plugin="Rcpp", verbose=FALSE,includes='')

与我以前的版本比较:

library(microbenchmark)

baseRVersion <- function(v,reset){
   a <- cumsum(v)
   a[!with(list(idx=as.integer(a / reset)),idx >= 1 & !duplicated(idx))] <- 0
   a
}

RcppVersion <- function(v,reset){
  cumsumResetRcpp(v,reset)
}

set.seed(1234)
v <- sample(5,1e6,replace=TRUE)

microbenchmark(baseRVersion(v,10), RcppVersion(v,10),times=20)


# Result :
   Unit: milliseconds
                expr      min       lq     mean    median       uq      max neval
 baseRVersion(v, 10) 69.78914 74.34717 91.67828 102.95764 103.6911 105.4055    20
  RcppVersion(v, 10) 17.28785 17.58432 18.89449  19.25759  19.8595  20.5627    20

答案 2 :(得分:3)

设置所有小于10的cumsums或将模数除以10的值重复为零的那些:

a <- cumsum(v)
 a %/% 10
 [1] 0 0 1 1 1 2 2 3 3 3

a[ duplicated(a %/% 10) | a<10 ] <- 0
a
 [1]  0  0 10  0  0 22  0 30  0  0

答案 3 :(得分:2)

因为我永远无法抵抗......

qaswed <-function(v) {
res <- cumsum(v)
reset <- 10
resd <- res/reset
res[diff(c(0, floor(resd))) == 0] <- 0
}

digemall <-function(v){
reset <- 10
 with(list(idx=as.integer(cumsum(v) / reset)),idx >= 1 & !duplicated(idx))
 }

 colonel <-function(v){
 ifelse(c(0, diff(cumsum(v) %/% 10)), cumsum(v), 0)
 }

 userx <- function(v){
 a <- cumsum(v)
c(a[1] >= 10, a[-1] %/% 10 > a[-length(a)] %/% 10)
}

set.seed(5)
v <- sample(5,1e6,replace=TRUE)

microbenchmark(qaswed(v),digemall(v),colonel(v),userx(v),times=10)



 Unit: milliseconds
        expr       min        lq      mean    median        uq       max neval
   qaswed(v)  45.97558  50.29943  86.54772  85.52356  88.60232 200.89699    10
 digemall(v)  54.12038  58.85200  67.15433  60.51172  64.40194  99.32623    10
  colonel(v) 200.80942 233.56203 254.33662 252.65635 275.16588 306.76971    10
    userx(v)  53.87098  56.55786  71.38571  57.98169  92.94224  96.69956    10

答案 4 :(得分:1)

v <- c(3, 5, 2, 5, 3, 4, 5, 3, 1, 4)
a <- cumsum(v)
c(a[1] >= 10, a[-1] %/% 10 > a[-length(a)] %/% 10)

输出:

[1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE