Question

我有一个巨大的data.frame，其中包含多个NA值。如果经常发生许多NA值，我似乎遇到了问题。

是否有一种简单的方法可以找到出现NA值的行，例如一个接一个20次，但不是20个NA值出现孤立的那个？

编辑（由agstudy添加）

接受的解决方案使用apply，这对于hudge矩阵效率不高。因此，我编辑了解决方案（我添加了Rcpp标记）以请求更多高效解决方案。

Answer 1

您可以使用complete.cases创建一个与rle相似的函数来计算连续的缺失值：

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})

然后只保留好行：

dat[!cons.missings(dat,20),]

4个连接词缺失值的示例：

dat <- as.matrix(t(data.frame(a= c(1,rep(NA,4),5),
           b= c(2,rep(NA,2),1,rep(NA,2)))))

 [,1] [,2] [,3] [,4] [,5] [,6]
a    1   NA   NA   NA   NA    5
b    2   NA   NA    1   NA   NA

dat[!cons.missings(dat,3),]
[1]  2 NA NA  1 NA NA

Answer 2

虽然不是你所说的“Rcpp”，但这里有一个使用.Call的替代方案，这似乎是有效的：

library(inline)

ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
    SEXP mat, dims, ans;

    PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
    PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
    R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
    R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];

    R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation 
                                                             //of a pointer to store 
                                                             //the indices of rows
                                                          //that match the criterion.
                                                           //a classic usage of this
                                                         //is in do_which (summary.c)

    double *pmat = REAL(mat);  //pointer to the matrix input
    for(int ir = 0; ir < rows; ir++) {
       R_len_t COUNT_CONS_NAS = 0;
       for(int ic = 0; ic < cols; ic++) {
           if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
               COUNT_CONS_NAS++;          //start counting NAs  
               if(COUNT_CONS_NAS == n) break;  //no need to search all columns
           }
           else {
               COUNT_CONS_NAS = 0; //if not NA, counter back to zero 
           }
       }
       if(COUNT_CONS_NAS == n) {  //if the specific row matched the criterion
           buf[b] = ir + 1;   //store its index
           b++;
       }
    }

    PROTECT(ans = allocVector(INTSXP, b));  //allocate a vector with 
                                           //length = No rows that matched criterion
    memcpy(INTEGER(ans), buf, sizeof(int)*b);  //copy rows indices to 
                                               //the pointer of ans

    UNPROTECT(3);

    return(ans);
')


set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA
#[4,]   NA   NA   NA   NA   NA
#[5,]   NA   NA   NA   NA   NA
#[6,]    0   NA   NA   NA   NA
ff(mat, 3)
#[1] 4 5 6    
mat[-ff(mat, 3),]      
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA

还有一些基准测试：

#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ], 
               dat[cons_missings(is.na(dat),3), ],
               dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
                                expr         min          lq      median         uq        max neval
       #dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709    10
 #dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023    10
                  #dat[-ff(dat, 4), ]    6.444897    7.749669    8.971304   11.35649   58.94499    10

#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3)) 
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]

sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671

Answer 3

我使用Rcpp添加另一个答案，因为OP使用的是大矩阵。我不是Rcpp精通，所以即使我认为我试图实现有效的rle missings算法，我也无法获得更好的解决方案。

#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
bool maxMissingSequence(IntegerVector x,int n) {

  // Initialise first value
  int lmissings = 1;
  double prev = x[0];
  for(IntegerVector::iterator it = x.begin() + 1; it != x.end(); ++it) {
    if (prev == *it && prev==1)lmissings++;
    if(lmissings >n) break;   // we are OK
    if(*it==0) lmissings =1;  // reset counter
    prev = *it;
  }
  return lmissings >n;
}

// [[Rcpp::export]]
IntegerVector cons_missings(IntegerMatrix Im, int n ){
   IntegerVector res ; 
   int nrows = Im.nrow();
   for (int i = 0; i < nrows; i++)
      if(!maxMissingSequence(Im(i,_),n))
         res.push_back(i+1);
  return res;
}

Benchmarkings

set.seed(2)
N <- 3*1e5
dat <- matrix(sample(c(1,NA),N,replace=TRUE),ncol=5)

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})


identical(dat[!cons.missings(dat,3),],dat[cons_missings(is.na(dat),3),])
[1] TRUE

system.time(dat[!cons.missings(dat,3),])
   user  system elapsed 
   4.24    0.02    4.35 

> system.time(dat[cons_missings(is.na(dat),3),])
   user  system elapsed 
   6.34    0.00    6.48

在data.frame行中查找NA值的序列

3 个答案:

Benchmarkings