r - 从其他变量中复制缺失值

时间:2016-04-19 16:32:32

标签: r missing-data

简单的问题,但我无法弄清楚如何执行以下操作。这是我的数据:

ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA

现在,我想用其他变量中可用的数据替换缺失值(NA)。重要的是,我需要r来获取最接近'的值。到了缺失的数据点。例如,对于ID 5,Time1和Time2应为" 15" (不是" 22")。

像这样:

ID    Time1    Time2    Time3    Time4
01    23       23       23       23  
02    21       21       21       21
03    22       22       25       25
04    29       29       20       20
05    15       15       15       22
06    11       11       11       11

我已经尝试了ifelse声明,但这没有成功。

谢谢!

4 个答案:

答案 0 :(得分:4)

使用data.table的滚动连接和set

library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col

changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]

changes[, {
  set(df, i = row, j = col, value = df[[ col_src ]][row])
  NULL
}, by=.(col,col_src)]

# based on input from bgoldst's answer
   ID  1  2  3  4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08  1  1  2  2

我们找到要切换的所有条目,然后通过set引用进行修改。我不确定roll="nearest"如何处理关系,但我确信可以调整。

答案 1 :(得分:3)

它看起来要困难得多。我构建了一个解决方案,一次只能处理一列,获取所有时间列索引与当前列索引之间绝对距离的pmin(),使用na.rm=T参数剥离NA。然后,可以使用索引矩阵将结果用于索引原始时间列,然后可以将其分配给目标data.frame中的当前列索引。

这种设计的一个优点是它可以在行上完全矢量化。换句话说,它不会一次迭代一行。对于极其重的输入,这可能是一个优势。另一方面,该解决方案确实涉及构建与所有时间列(timematnacolsoff)并行的矩阵,这对于大型输入而言可能很昂贵。它基本上是通过交换内存来节省CPU。

我添加了几行来测试OP的示例data.frame未涵盖的其他案例;具体而言(1)全NA行,以及(2)NA值两侧具有候选非NA值的行。

输入:

df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    NA    NA
## 2 02    21    21    21    NA
## 3 03    22    22    25    NA
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    NA
## 6 06    NA    NA    15    22
## 7 07    NA    NA    11    NA
## 8 08     1    NA    NA     2

解决方案:

ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
##   Time1 Time2 Time3 Time4
## 1     1     2    NA    NA
## 2     1     2     3    NA
## 3     1     2     3    NA
## 4    NA    NA    NA    NA
## 5     1     2     3    NA
## 6    NA    NA     3     4
## 7    NA    NA     3    NA
## 8     1    NA    NA     4
for (ci in seq_len(ncol(timemat))) {
    off <- abs(nacols-ci);
    best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
    df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

Rcpp解决方案:

library(Rcpp);
cppFunction('
    IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
        IntegerMatrix res(df.nrows(),cis.size());
        if (df.nrows()==0 || cis.size()==0) return res;
        IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
        for (int cisi = 0; cisi < cis0.size(); ++cisi) {
            IntegerVector colCur = df[cis0[cisi]];
            for (int ri = 0; ri < colCur.size(); ++ri) {
                if (!IntegerVector::is_na(colCur[ri])) {
                    res(ri,cisi) = colCur[ri];
                    continue;
                }
                int leftOk;
                int rightOk;
                IntegerVector colLeft;
                IntegerVector colRight;
                bool set = false; // assumption
                for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
                    if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
                        res(ri,cisi) = colLeft[ri];
                        set = true;
                        break;
                    } else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
                        res(ri,cisi) = colRight[ri];
                        set = true;
                        break;
                    }
                }
                if (!set) res(ri,cisi) = NA_INTEGER;
            }
        }
        return res;
    }
');
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
##   ID Time1 Time2 Time3 Time4
## 1 01    23    23    23    23
## 2 02    21    21    21    21
## 3 03    22    22    25    25
## 4 04    NA    NA    NA    NA
## 5 05    29    29    20    20
## 6 06    15    15    15    22
## 7 07    11    11    11    11
## 8 08     1     1     2     2

答案 2 :(得分:3)

又一次尝试。尽可能地分解:(1)从左到右循环一次,向前携带最后一个非NA值,并且还记录非NA被替换为每个NA的位置,(2)从右到左再次循环( a)向后替换携带非NA的NA,以及(b)将每个NA替换为非NA的非NA的距离与当前的非NA进行比较并保持或替换。尽管有两个显式循环,但计算涉及length == nrow(x)

的向量
ff = function(x)
{    
    taken_from = lapply(seq_along(x), rep_len, nrow(x))
    nas = lapply(x, is.na)

    #loop left -> right 
    # carry forward non-NAs and record which non-NA replaced NA
    last_nona = !nas[[1L]]
    for(j in 2:length(x)) {
        i = which(nas[[j]] & last_nona)
        x[[j]][i] = x[[j - 1L]][i]
        taken_from[[j]][i] = taken_from[[j - 1L]][i]
        last_nona = !is.na(x[[j]])
    }

    #loop right -> left
    #if NA and not replace carry the previous non-NA backward
    #else compare which non-NA is nearer and replace appropriately
    last_nona = !nas[[length(x)]]
    for(j in (length(x) - 1L):1L) {
        i1 = which(nas[[j]] & last_nona)
        i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
        ii = i1[j == taken_from[[j]][i1]]
        taken_from[[j]][i] = taken_from[[j + 1L]][i]
        taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
        x[[j]][i] = x[[j + 1L]][i]
        x[[j]][ii] = x[[j + 1L]][ii]
        last_nona = !is.na(x[[j]])
    }

    return(x)
}

使用bgoldst的数据:

ff(df[-1L])
#  Time1 Time2 Time3 Time4
#1    23    23    23    23
#2    21    21    21    21
#3    22    22    25    25
#4    NA    NA    NA    NA
#5    29    29    20    20
#6    15    15    15    22
#7    11    11    11    11
#8     1     1     2     2

以及必要的基准测试:

set.seed(911)            
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
#   user  system elapsed 
#   0.82    0.38    1.75 
system.time({ ansbgoldst1 = bgoldst1(DAT) })
#   user  system elapsed 
#  20.96    7.53   42.04 
system.time({ ansbgoldst2 = bgoldst2(DAT) })
#   user  system elapsed 
#   0.97    0.25    1.64 
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
#   user  system elapsed 
#   5.84    1.46    8.59 
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE

功能:

bgoldst1 = function(x)
{
    ris = seq_len(nrow(x))
    xm = as.matrix(x)
    nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
    for(ci in seq_along(x)) {
        off = abs(nacols - ci)
        best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
        x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
    }
    x
}

bgoldst2 = function(x) 
{
    ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
    names(ans) = names(x)
    ans
}

frank = function(x)
{
    x = copy(x)
    good = as.data.table(which(!is.na(x), arr.ind = TRUE))
    all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
    good$col = good$col
    good$col_src = good$col

    changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]

    changes[, {
            set(x, i = row, j = col, value = x[[col_src]][row])
            NULL
            }, by = .(col, col_src)]
    x
}

答案 3 :(得分:1)

这是一个简单的解决方案:

x <-read.table(text="ID    Time1    Time2    Time3    Time4
01    23       23       NA       NA  
02    21       21       21       NA
03    22       22       25       NA
04    29       29       20       NA
05    NA       NA       15       22
06    NA       NA       11       NA", header=TRUE)

x <- as.matrix(x[,-1])

dofill <- function(r){
  PREV <- c(NA, suppressWarnings(head(r, -1)))
  NEXT <- c(tail(r,-1), NA)
  r[is.na(r)] <- PREV[is.na(r)]
  r[is.na(r)] <- NEXT[is.na(r)]
  r
}

rlefill <- function(r){
  r[is.na(r)] <- "NA"
  rle1 <- rle(r)
  rle1$values <- dofill(as.numeric(rle1$values))
  inverse.rle(rle1)
}

t(apply(x, 1, rlefill))

dofill只是将所有NA替换为前一个值,并将剩余的NA替换为下一个值。

将一系列NA变换为“一个大NA”需要

rlefill

当然,如果你有更多的时间点,你可能需要像...这样的东西。

cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);

...即从数据框中提取相关部分的更通用的解决方案。

(现在我意识到这并不是你提出的问题 - 我的解决方案总是使用前面的值,如果它可用,即使以下值更接近时间。它对你的示例数据集没有影响但它可能会对实际数据产生影响。)