有条件地根据索引

时间:2017-07-04 14:18:56

标签: r

最好用一个例子来解释。

我有一个名为data.frame的{​​{1}}的向量或列:

vec

我想要一个矢量化过程(不是vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA) 循环),以便在观察到for时更改三个尾随NA

结束向量是:

1

如果我们有:

c(NA, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, NA)

结束向量看起来像:

vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)

一个写得非常糟糕的解决方案是:

c(NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA)

7 个答案:

答案 0 :(得分:17)

另一种选择:

`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)
# [1] NA NA  1  1  1  1 NA  1  1  1  1 NA NA NA

虽然以上内容适用于示例,但如果在最后位置找到1,则会延长vec的长度。最好做一个简单的检查并包装成一个函数:

threeNAs<-function(vec) {
    ind<-c(outer(which(vec==1),1:3,"+"))
    ind<-ind[ind<=length(vec)]
    `[<-`(vec,ind,1)
}

答案 1 :(得分:13)

另一个快速解决方案:

vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1

给出:

> vec
 [1] NA NA  1  1  1  1 NA  1  1  1  1 NA NA NA

基准测试仅适用于较大的数据集。一个带有10k大向量的基准测试和几个已发布的解决方案:

library(microbenchmark)

microbenchmark(ans.jaap = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4); 
                           vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1},
               ans.989 = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
                          r <- which(vec==1);
                          vec[c(mapply(seq, r, r+3))] <- 1},
               ans.sotos = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
                            vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1},
               ans.gregor = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
                             vec[is.na(vec)] <- 0;
                             n <- length(vec);
                             vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)]);
                             vec[vec == 0] <- NA},
               ans.moody = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
                            output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))});
                            output[output] <- 1;
                            output[output==0] <- NA},
               ans.nicola = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
                             `[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)})

给出了以下基准:

Unit: microseconds
       expr        min         lq       mean     median         uq        max neval   cld
   ans.jaap   1778.905   1937.414   3064.686   2100.595   2257.695  86233.593   100 a    
    ans.989  87688.166  89638.133  96992.231  90986.269  93326.393 182431.366   100   c  
  ans.sotos 125344.157 127968.113 132386.664 130117.438 132951.380 214460.174   100    d 
 ans.gregor   4036.642   5824.474  10861.373   6533.791   7654.587  87806.955   100  b   
  ans.moody 173146.810 178369.220 183698.670 180318.799 184000.062 264892.878   100     e
 ans.nicola    966.927   1390.486   1723.395   1604.037   1904.695   3310.203   100 a

答案 2 :(得分:5)

究竟是什么&#39; vectorised&#39;,如果不是用C语言编写的循环?

这是一个基准测试很好的C ++循环。

vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)

library(Rcpp)

cppFunction('NumericVector fixVec(NumericVector myVec){

    int n = myVec.size();
    int foundCount = 0;

    for(int i = 0; i < n; i++){
      if(myVec[i] == 1) foundCount = 1; 

      if(ISNA(myVec[i])){
        if(foundCount >= 1 & foundCount <= 3){
          myVec[i] = 1;
          foundCount++;
        }
      }
    }
    return myVec;
    }')

fixVec(vec)
# [1] NA NA  1  1  1  1 NA  1  1  1  1 NA NA NA

基准

library(microbenchmark)

microbenchmark(
      ans.jaap = {
        vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4); 
      vec[rep(which(vec == 1), each = 4) + c(0:3)] <- 1
},

    ans.nicola = {
        vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
      `[<-`(vec,c(outer(which(vec==1),0:3,"+")),1)
        },

    ans.symbolix = {
        vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
      vec <- fixVec(vec)
        }
)

# Unit: microseconds
# expr              min       lq      mean   median        uq       max neval
# ans.jaap     2017.789 2264.318 2905.2437 2579.315 3588.4850  4667.249   100
# ans.nicola   1242.002 1626.704 3839.4768 2095.311 3066.4795 81299.962   100
# ans.symbolix  504.577  533.426  838.5661  718.275  966.9245  2354.373   100


vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec <- fixVec(vec)

vec2 <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec2[rep(which(vec2 == 1), each = 4) + c(0:3)] <- 1

identical(vec, vec2)
# [1] TRUE

答案 3 :(得分:3)

以下代码执行您要求的操作。它涉及&#34;转移&#34;向量然后添加移位版本

vec[is.na(vec)] <- 0                                 
n <- length(vec)                                     
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)])  
vec[vec == 0] <- NA                                    
vec[vec != 0] <- 1                                     

# vec                    |   0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 ,0, 0
# c(0, vec[1:(n-1)])     | + 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0, 0
# c(0, 0, vec[1:(n-2)])  | + 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0
# c(0,0,0,vec[1:(n-3)])  | + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 
#                        |-------------------------------------------
#                        |   0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0 

答案 4 :(得分:2)

非矢量化解决方案,但是,使用基础R的另一个选项,

vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec
#[1] NA NA  1  1  1  1 NA  1  1  1  1 NA NA NA

vec1[unique(as.vector(t(sapply(which(vec1 == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec1
#[1] NA NA  1  1  1  1  1  1  1  1  1 NA NA NA

答案 5 :(得分:0)

使用sapplyanyis.na

output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))})
output[output] <- 1
output[output==0] <- NA

答案 6 :(得分:0)

这个怎么样:

r <- which(vec==1)
vec[c(mapply(seq, r, r+3))] <- 1

示例:

vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA  1  1  1  1 NA  1  1  1  1 NA NA NA

vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA  1  1  1  1  1  1  1  1  1 NA NA NA