简单的问题,但我无法弄清楚如何执行以下操作。这是我的数据:
ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA
现在,我想用其他变量中可用的数据替换缺失值(NA)。重要的是,我需要r来获取最接近'的值。到了缺失的数据点。例如,对于ID 5,Time1和Time2应为" 15" (不是" 22")。
像这样:
ID Time1 Time2 Time3 Time4
01 23 23 23 23
02 21 21 21 21
03 22 22 25 25
04 29 29 20 20
05 15 15 15 22
06 11 11 11 11
我已经尝试了ifelse声明,但这没有成功。
谢谢!
答案 0 :(得分:4)
使用data.table的滚动连接和set
:
library(data.table)
good = as.data.table( which(!is.na(df[-1]), arr.ind = TRUE) )
all = CJ(row = seq(nrow(df)), col = seq(2L, ncol(df)))
good$col = good$col + 1L
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll="nearest"][ col != col_src ]
changes[, {
set(df, i = row, j = col, value = df[[ col_src ]][row])
NULL
}, by=.(col,col_src)]
# based on input from bgoldst's answer
ID 1 2 3 4
1: 01 23 23 23 23
2: 02 21 21 21 21
3: 03 22 22 25 25
4: 04 NA NA NA NA
5: 05 29 29 20 20
6: 06 15 15 15 22
7: 07 11 11 11 11
8: 08 1 1 2 2
我们找到要切换的所有条目,然后通过set
引用进行修改。我不确定roll="nearest"
如何处理关系,但我确信可以调整。
答案 1 :(得分:3)
它看起来要困难得多。我构建了一个解决方案,一次只能处理一列,获取所有时间列索引与当前列索引之间绝对距离的pmin()
,使用na.rm=T
参数剥离NA。然后,可以使用索引矩阵将结果用于索引原始时间列,然后可以将其分配给目标data.frame中的当前列索引。
这种设计的一个优点是它可以在行上完全矢量化。换句话说,它不会一次迭代一行。对于极其重的输入,这可能是一个优势。另一方面,该解决方案确实涉及构建与所有时间列(timemat
,nacols
和off
)并行的矩阵,这对于大型输入而言可能很昂贵。它基本上是通过交换内存来节省CPU。
我添加了几行来测试OP的示例data.frame未涵盖的其他案例;具体而言(1)全NA行,以及(2)NA值两侧具有候选非NA值的行。
输入:
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 NA NA
## 2 02 21 21 21 NA
## 3 03 22 22 25 NA
## 4 04 NA NA NA NA
## 5 05 29 29 20 NA
## 6 06 NA NA 15 22
## 7 07 NA NA 11 NA
## 8 08 1 NA NA 2
解决方案:
ris <- seq_len(nrow(df));
cis <- grep('^Time',names(df));
timemat <- as.matrix(df[cis]);
nacols <- as.data.frame(ifelse(is.na(timemat),NA,col(timemat)));
nacols;
## Time1 Time2 Time3 Time4
## 1 1 2 NA NA
## 2 1 2 3 NA
## 3 1 2 3 NA
## 4 NA NA NA NA
## 5 1 2 3 NA
## 6 NA NA 3 4
## 7 NA NA 3 NA
## 8 1 NA NA 4
for (ci in seq_len(ncol(timemat))) {
off <- abs(nacols-ci);
best <- which(off==do.call(pmin,c(off,na.rm=T)),arr.ind=T);
df[cis[ci]] <- timemat[matrix(c(ris,best[match(ris,best[,'row']),'col']),nrow(df))];
};
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
Rcpp解决方案:
library(Rcpp);
cppFunction('
IntegerMatrix fillDFNAsWithNearestInRow(DataFrame df, IntegerVector cis ) {
IntegerMatrix res(df.nrows(),cis.size());
if (df.nrows()==0 || cis.size()==0) return res;
IntegerVector cis0 = clone(cis); for (int cisi = 0; cisi < cis0.size(); ++cisi) --cis0[cisi]; // correct from R 1-based to Rcpp 0-based
for (int cisi = 0; cisi < cis0.size(); ++cisi) {
IntegerVector colCur = df[cis0[cisi]];
for (int ri = 0; ri < colCur.size(); ++ri) {
if (!IntegerVector::is_na(colCur[ri])) {
res(ri,cisi) = colCur[ri];
continue;
}
int leftOk;
int rightOk;
IntegerVector colLeft;
IntegerVector colRight;
bool set = false; // assumption
for (int off = 1; (leftOk = cisi-off>=0, rightOk = cisi+off<cis0.size(), leftOk ) || rightOk; ++off) {
if (leftOk && (colLeft = df[cis0[cisi-off]], !IntegerVector::is_na(colLeft[ri]))) {
res(ri,cisi) = colLeft[ri];
set = true;
break;
} else if (rightOk && (colRight = df[cis0[cisi+off]], !IntegerVector::is_na(colRight[ri]))) {
res(ri,cisi) = colRight[ri];
set = true;
break;
}
}
if (!set) res(ri,cisi) = NA_INTEGER;
}
}
return res;
}
');
df <- data.frame(ID=c('01','02','03','04','05','06','07','08'),Time1=c(23L,21L,22L,NA,29L,NA,NA,1L),Time2=c(23L,21L,22L,NA,29L,NA,NA,NA),Time3=c(NA,21L,25L,NA,20L,15L,11L,NA),Time4=c(NA,NA,NA,NA,NA,22L,NA,2L),stringsAsFactors=F);
cis <- grep('^Time',names(df));
df[cis] <- fillDFNAsWithNearestInRow(df,cis);
df;
## ID Time1 Time2 Time3 Time4
## 1 01 23 23 23 23
## 2 02 21 21 21 21
## 3 03 22 22 25 25
## 4 04 NA NA NA NA
## 5 05 29 29 20 20
## 6 06 15 15 15 22
## 7 07 11 11 11 11
## 8 08 1 1 2 2
答案 2 :(得分:3)
又一次尝试。尽可能地分解:(1)从左到右循环一次,向前携带最后一个非NA值,并且还记录非NA被替换为每个NA的位置,(2)从右到左再次循环( a)向后替换携带非NA的NA,以及(b)将每个NA替换为非NA的非NA的距离与当前的非NA进行比较并保持或替换。尽管有两个显式循环,但计算涉及length == nrow(x)
。
ff = function(x)
{
taken_from = lapply(seq_along(x), rep_len, nrow(x))
nas = lapply(x, is.na)
#loop left -> right
# carry forward non-NAs and record which non-NA replaced NA
last_nona = !nas[[1L]]
for(j in 2:length(x)) {
i = which(nas[[j]] & last_nona)
x[[j]][i] = x[[j - 1L]][i]
taken_from[[j]][i] = taken_from[[j - 1L]][i]
last_nona = !is.na(x[[j]])
}
#loop right -> left
#if NA and not replace carry the previous non-NA backward
#else compare which non-NA is nearer and replace appropriately
last_nona = !nas[[length(x)]]
for(j in (length(x) - 1L):1L) {
i1 = which(nas[[j]] & last_nona)
i = i1[(j - taken_from[[j]][i1]) > (taken_from[[(j + 1L)]][i1] - j)]
ii = i1[j == taken_from[[j]][i1]]
taken_from[[j]][i] = taken_from[[j + 1L]][i]
taken_from[[j]][ii] = taken_from[[j + 1L]][ii]
x[[j]][i] = x[[j + 1L]][i]
x[[j]][ii] = x[[j + 1L]][ii]
last_nona = !is.na(x[[j]])
}
return(x)
}
使用bgoldst的数据:
ff(df[-1L])
# Time1 Time2 Time3 Time4
#1 23 23 23 23
#2 21 21 21 21
#3 22 22 25 25
#4 NA NA NA NA
#5 29 29 20 20
#6 15 15 15 22
#7 11 11 11 11
#8 1 1 2 2
以及必要的基准测试:
set.seed(911)
DAT = as.data.frame(matrix(sample(c(NA, 0:10), 1e7, TRUE), 1e6, 10))
system.time({ ansff = ff(DAT) })
# user system elapsed
# 0.82 0.38 1.75
system.time({ ansbgoldst1 = bgoldst1(DAT) })
# user system elapsed
# 20.96 7.53 42.04
system.time({ ansbgoldst2 = bgoldst2(DAT) })
# user system elapsed
# 0.97 0.25 1.64
sf1 = system.time({ ansfrank = frank(DAT) }); sf2 = system.time( copy(DAT) )
sf1 - sf2
# user system elapsed
# 5.84 1.46 8.59
all.equal(ansff, ansbgoldst1)
#[1] TRUE
all.equal(ansbgoldst1, ansbgoldst2)
#[1] TRUE
all.equal(ansbgoldst2, ansfrank)
#[1] TRUE
功能:
bgoldst1 = function(x)
{
ris = seq_len(nrow(x))
xm = as.matrix(x)
nacols = as.data.frame(lapply(seq_along(x), function(i) { x[[i]][!is.na(x[[i]])] = i; x[[i]] }))
for(ci in seq_along(x)) {
off = abs(nacols - ci)
best = which(off == do.call(pmin, c(off, na.rm = TRUE)), arr.ind = TRUE)
x[ci] = xm[matrix(c(ris, best[match(ris, best[, "row"]), "col"]), nrow(x))]
}
x
}
bgoldst2 = function(x)
{
ans = as.data.frame(fillDFNAsWithNearestInRow(x, seq_along(x)))
names(ans) = names(x)
ans
}
frank = function(x)
{
x = copy(x)
good = as.data.table(which(!is.na(x), arr.ind = TRUE))
all = CJ(row = seq_len(nrow(x)), col = seq_len(ncol(x)))
good$col = good$col
good$col_src = good$col
changes = good[all, on = c("row", "col"), roll = "nearest"][col != col_src]
changes[, {
set(x, i = row, j = col, value = x[[col_src]][row])
NULL
}, by = .(col, col_src)]
x
}
答案 3 :(得分:1)
这是一个简单的解决方案:
x <-read.table(text="ID Time1 Time2 Time3 Time4
01 23 23 NA NA
02 21 21 21 NA
03 22 22 25 NA
04 29 29 20 NA
05 NA NA 15 22
06 NA NA 11 NA", header=TRUE)
x <- as.matrix(x[,-1])
dofill <- function(r){
PREV <- c(NA, suppressWarnings(head(r, -1)))
NEXT <- c(tail(r,-1), NA)
r[is.na(r)] <- PREV[is.na(r)]
r[is.na(r)] <- NEXT[is.na(r)]
r
}
rlefill <- function(r){
r[is.na(r)] <- "NA"
rle1 <- rle(r)
rle1$values <- dofill(as.numeric(rle1$values))
inverse.rle(rle1)
}
t(apply(x, 1, rlefill))
dofill
只是将所有NA替换为前一个值,并将剩余的NA替换为下一个值。
rlefill
。
当然,如果你有更多的时间点,你可能需要像...这样的东西。
cis <- grep('^Time',names(df))
timemat <- as.matrix(df[cis]);
...即从数据框中提取相关部分的更通用的解决方案。
(现在我意识到这并不是你提出的问题 - 我的解决方案总是使用前面的值,如果它可用,即使以下值更接近时间。它对你的示例数据集没有影响但它可能会对实际数据产生影响。)