我有一个非常大的data.frame
,其中有几个NA
值。如果许多NA
值连续出现,似乎会出现问题。
有没有一种简单的方法可以找到那些行中,NA
值连续出现20次,但不包括20个NA
值孤立出现的行?
编辑(由agstudy添加)
接受的解决方案使用apply
,对于巨大的矩阵来说效率不是很高。因此,我编辑了解决方案(我添加了Rcpp
标签)以寻求更加高效的解决方案。
complete.cases
的函数,使用 rle
计算连续缺失值:cons.missings <-
function(dat,n)
apply(is.na(dat),1,function(x){
yy <- rle(x)
any(yy$lengths[yy$values]>n)
})
然后只保留好行:
dat[!cons.missings(dat,20),]
有4个连接器缺失值的示例:
dat <- as.matrix(t(data.frame(a= c(1,rep(NA,4),5),
b= c(2,rep(NA,2),1,rep(NA,2)))))
[,1] [,2] [,3] [,4] [,5] [,6]
a 1 NA NA NA NA 5
b 2 NA NA 1 NA NA
dat[!cons.missings(dat,3),]
[1] 2 NA NA 1 NA NA
.Call
的替代方法,似乎是有效的:library(inline)
ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
SEXP mat, dims, ans;
PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];
R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation
//of a pointer to store
//the indices of rows
//that match the criterion.
//a classic usage of this
//is in do_which (summary.c)
double *pmat = REAL(mat); //pointer to the matrix input
for(int ir = 0; ir < rows; ir++) {
R_len_t COUNT_CONS_NAS = 0;
for(int ic = 0; ic < cols; ic++) {
if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
COUNT_CONS_NAS++; //start counting NAs
if(COUNT_CONS_NAS == n) break; //no need to search all columns
}
else {
COUNT_CONS_NAS = 0; //if not NA, counter back to zero
}
}
if(COUNT_CONS_NAS == n) { //if the specific row matched the criterion
buf[b] = ir + 1; //store its index
b++;
}
}
PROTECT(ans = allocVector(INTSXP, b)); //allocate a vector with
//length = No rows that matched criterion
memcpy(INTEGER(ans), buf, sizeof(int)*b); //copy rows indices to
//the pointer of ans
UNPROTECT(3);
return(ans);
')
set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
#[4,] NA NA NA NA NA
#[5,] NA NA NA NA NA
#[6,] 0 NA NA NA NA
ff(mat, 3)
#[1] 4 5 6
mat[-ff(mat, 3),]
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
以下是一些基准测试结果:
#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ],
dat[cons_missings(is.na(dat),3), ],
dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
expr min lq median uq max neval
#dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709 10
#dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023 10
#dat[-ff(dat, 4), ] 6.444897 7.749669 8.971304 11.35649 58.94499 10
#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3))
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]
sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671
我使用Rcpp添加了另一个答案,因为OP正在使用大矩阵。虽然我不是Rcpp专家,但我尝试实现了一种高效的rle missings算法,即使我认为我没有得到更好的解决方案。
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
bool maxMissingSequence(IntegerVector x,int n) {
// Initialise first value
int lmissings = 1;
double prev = x[0];
for(IntegerVector::iterator it = x.begin() + 1; it != x.end(); ++it) {
if (prev == *it && prev==1)lmissings++;
if(lmissings >n) break; // we are OK
if(*it==0) lmissings =1; // reset counter
prev = *it;
}
return lmissings >n;
}
// [[Rcpp::export]]
IntegerVector cons_missings(IntegerMatrix Im, int n ){
IntegerVector res ;
int nrows = Im.nrow();
for (int i = 0; i < nrows; i++)
if(!maxMissingSequence(Im(i,_),n))
res.push_back(i+1);
return res;
}
set.seed(2)
N <- 3*1e5
dat <- matrix(sample(c(1,NA),N,replace=TRUE),ncol=5)
cons.missings <-
function(dat,n)
apply(is.na(dat),1,function(x){
yy <- rle(x)
any(yy$lengths[yy$values]>n)
})
identical(dat[!cons.missings(dat,3),],dat[cons_missings(is.na(dat),3),])
[1] TRUE
system.time(dat[!cons.missings(dat,3),])
user system elapsed
4.24 0.02 4.35
> system.time(dat[cons_missings(is.na(dat),3),])
user system elapsed
6.34 0.00 6.48
try-catch
语句,以便“坏”行不会导致函数崩溃。 - Carl Witthoft