使用预先存在的缺失随机降低数据

时间:2013-09-16 21:49:13

标签: r missing-data

我研究缺失并且我试图在R中现有缺失的基础上随机降级数据集。基本上我想要做的是随机选择我的数据框中的位置并将它们更改为NA但仅当它们不是NA时。我已经编写了下面的函数,但是在较大的数据帧上它实际上很慢,甚至在运行了几个小时后它也没有生成解决方案。有没有人有更好的方法来随机降级数据(使用NA检查)或知道一种方法来提高我的功能效率? 非常感谢你!

      degradefunction<- function(x,nrow1, ncol1,del.amount){ 
repeat{
#Generate random row numbers
      rowsample<-sample(1:nrow1,del.amount,replace=TRUE)
#Generate random column numbers
      colsample<-sample(1:ncol1, del.amount, replace=TRUE)
      x.del<-x
#For each deletion, with a randomly selected row and column location
      for (i in 1:del.amount){ 
#Test if the randomly selected location is not an NA
      if(is.na(x.del[rowsample[i],colsample[i]])==FALSE)
#If it is not NA but is a number make it NA now
      {x.del[rowsample[i],colsample[i]]<-NA} 
#If our randomly selected location IS an NA
      else {   
#Get a new randomly selected location (row and column)
     resamplenrow1<-sample(1:nrow1,1,replace=TRUE)
     resamplencol1<-sample(1:ncol1,1,replace=TRUE)
#See if the new location is an NA 
     if(is.na(x.del[resamplenrow1,resamplencol1])==FALSE){
#If the randomly selected location is not an NA, make it an NA
     x.del[resamplenrow1,resamplencol1]<-NA}}}
#Break the loop when the amount of data in the dataframe has the number of 
#missing values we want (in a case starting with 0 NAs we want it to break 
#when the number of NAs in x.del is equal to the del.amount, but when we 
#do this with data including missingness we want the number of missing 
#values to equal the deletion amount plus the number of NAs previously 
#occurring in the data.
         if(sum(is.na(x.del))==(del.amount + (sum(is.na(x))))){break}
         }
#Give back the degraded data
         return(x.del)
         }
#Test the degrade function
#On data with 0 previous missingness
        mypractice<-matrix(c(1,2,3, 4), nrow=10, ncol=4)
        mypractice
        mypractice<-as.data.frame(mypractice)
        str(mypractice)
        sum(is.na(mypractice))
        newvarx<-degradefunction(mypractice, 10,4,16)
        newvarx
        sum(is.na(newvarx))
#The sum of missingness is now 16
#On data with previous missingness
        mypractice<-matrix(c(1,2,3,NA), nrow=10, ncol=4)
        mypractice
        mypractice<-as.data.frame(mypractice)
        str(mypractice)
        sum(is.na(mypractice))
        newvarx<-degradefunction(mypractice, 10,4,16)
        newvarx
        sum(is.na(newvarx))
#We get a total missingness (26) of the missingness we induced (16) 
#and the missingness in the original data (10)

1 个答案:

答案 0 :(得分:2)

这个怎么样?

degradefunction <- function(x, del.amount){
  # 1) indicate which cells are NA (works with matrix or df)
  preNAs     <- is.na(x)
  # 2) how many cells are eligible to be degraded?
  OpenSpots  <- prod(dim(x)) - sum(preNAs)
  # 3) of these, select del.amount for replacement with NA
  newNas     <- sample(1:OpenSpots, size = del.amount, replace = FALSE)
  # 4) impute these NAs, ignoring the original NAs
  x[!preNAs][newNas] <- NA
  x
}
degradefunction(mypractice,16)