我有一个巨大的data.frame
,其中包含多个NA
值。如果经常发生许多NA
值,我似乎遇到了问题。
是否有一种简单的方法可以找到出现NA
值的行,例如一个接一个20次,但不是20个NA
值出现孤立的那个?
编辑(由agstudy添加)
接受的解决方案使用apply
,这对于hudge矩阵效率不高。因此,我编辑了解决方案(我添加了Rcpp
标记)以请求更多高效解决方案。
答案 0 :(得分:3)
您可以使用complete.cases
创建一个与rle
相似的函数来计算连续的缺失值:
cons.missings <-
function(dat,n)
apply(is.na(dat),1,function(x){
yy <- rle(x)
any(yy$lengths[yy$values]>n)
})
然后只保留好行:
dat[!cons.missings(dat,20),]
4个连接词缺失值的示例:
dat <- as.matrix(t(data.frame(a= c(1,rep(NA,4),5),
b= c(2,rep(NA,2),1,rep(NA,2)))))
[,1] [,2] [,3] [,4] [,5] [,6]
a 1 NA NA NA NA 5
b 2 NA NA 1 NA NA
dat[!cons.missings(dat,3),]
[1] 2 NA NA 1 NA NA
答案 1 :(得分:2)
虽然不是你所说的“Rcpp”,但这里有一个使用.Call
的替代方案,这似乎是有效的:
library(inline)
ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
SEXP mat, dims, ans;
PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];
R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation
//of a pointer to store
//the indices of rows
//that match the criterion.
//a classic usage of this
//is in do_which (summary.c)
double *pmat = REAL(mat); //pointer to the matrix input
for(int ir = 0; ir < rows; ir++) {
R_len_t COUNT_CONS_NAS = 0;
for(int ic = 0; ic < cols; ic++) {
if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
COUNT_CONS_NAS++; //start counting NAs
if(COUNT_CONS_NAS == n) break; //no need to search all columns
}
else {
COUNT_CONS_NAS = 0; //if not NA, counter back to zero
}
}
if(COUNT_CONS_NAS == n) { //if the specific row matched the criterion
buf[b] = ir + 1; //store its index
b++;
}
}
PROTECT(ans = allocVector(INTSXP, b)); //allocate a vector with
//length = No rows that matched criterion
memcpy(INTEGER(ans), buf, sizeof(int)*b); //copy rows indices to
//the pointer of ans
UNPROTECT(3);
return(ans);
')
set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
#[4,] NA NA NA NA NA
#[5,] NA NA NA NA NA
#[6,] 0 NA NA NA NA
ff(mat, 3)
#[1] 4 5 6
mat[-ff(mat, 3),]
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
还有一些基准测试:
#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ],
dat[cons_missings(is.na(dat),3), ],
dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
expr min lq median uq max neval
#dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709 10
#dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023 10
#dat[-ff(dat, 4), ] 6.444897 7.749669 8.971304 11.35649 58.94499 10
#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3))
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]
sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671
答案 2 :(得分:0)
我使用Rcpp添加另一个答案,因为OP使用的是大矩阵。我不是Rcpp精通,所以即使我认为我试图实现有效的rle missings算法,我也无法获得更好的解决方案。
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
bool maxMissingSequence(IntegerVector x,int n) {
// Initialise first value
int lmissings = 1;
double prev = x[0];
for(IntegerVector::iterator it = x.begin() + 1; it != x.end(); ++it) {
if (prev == *it && prev==1)lmissings++;
if(lmissings >n) break; // we are OK
if(*it==0) lmissings =1; // reset counter
prev = *it;
}
return lmissings >n;
}
// [[Rcpp::export]]
IntegerVector cons_missings(IntegerMatrix Im, int n ){
IntegerVector res ;
int nrows = Im.nrow();
for (int i = 0; i < nrows; i++)
if(!maxMissingSequence(Im(i,_),n))
res.push_back(i+1);
return res;
}
set.seed(2)
N <- 3*1e5
dat <- matrix(sample(c(1,NA),N,replace=TRUE),ncol=5)
cons.missings <-
function(dat,n)
apply(is.na(dat),1,function(x){
yy <- rle(x)
any(yy$lengths[yy$values]>n)
})
identical(dat[!cons.missings(dat,3),],dat[cons_missings(is.na(dat),3),])
[1] TRUE
system.time(dat[!cons.missings(dat,3),])
user system elapsed
4.24 0.02 4.35
> system.time(dat[cons_missings(is.na(dat),3),])
user system elapsed
6.34 0.00 6.48