我必须通过ID将缺失值替换为最大值(Value)。如何做R
ID Value
1 NA
5 15
8 16
6 8
7 65
8 NA
5 25
1 62
6 14
7 NA
9 11
8 12
9 36
1 26
4 13
答案 0 :(得分:2)
我首先使用aggregate()
调用预先计算最大值,并预先计算data.frame的哪些行具有NA值。然后,您可以将ID与聚合表进行匹配,以提取相应的最大值。
maxes <- aggregate(Value~ID,df,max,na.rm=T);
nas <- which(is.na(df$Value));
df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)];
df;
## ID Value
## 1 1 62
## 2 5 15
## 3 8 16
## 4 6 8
## 5 7 65
## 6 8 16
## 7 5 25
## 8 1 62
## 9 6 14
## 10 7 65
## 11 9 11
## 12 8 12
## 13 9 36
## 14 1 26
## 15 4 13
替代方案,使用ave()
:
df$Value <- ave(df$Value,df$ID,FUN=function(x) { x[is.na(x)] <- max(x,na.rm=T); x; });
df;
## ID Value
## 1 1 62
## 2 5 15
## 3 8 16
## 4 6 8
## 5 7 65
## 6 8 16
## 7 5 25
## 8 1 62
## 9 6 14
## 10 7 65
## 11 9 11
## 12 8 12
## 13 9 36
## 14 1 26
## 15 4 13
数据强>
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,
65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
注意:
bgoldst2()
,rafa()
和akrun()
,以防止群组中出现非零N的情况;否则,max(...,na.rm=T)
会返回-Inf
,这会导致后续操作陷入困境。我为所有三名守卫使用了相同的算法。 thierry()
和bgoldst1()
无需修改。res
显示了两个参数,所有解决方案的平均运行时间以及微基准总结算法选择的单位。library(microbenchmark);
library(dplyr);
library(data.table);
library(zoo);
thierry <- function(df) df %>% group_by(ID) %>% mutate(Value = ifelse(is.na(Value), max(Value, na.rm = TRUE), Value));
bgoldst1 <- function(df) { maxes <- aggregate(Value~ID,df,max,na.rm=T); nas <- which(is.na(df$Value)); df$Value[nas] <- maxes$Value[match(df$ID[nas],maxes$ID)]; df; };
bgoldst2 <- function(df) { df$Value <- ave(df$Value,df$ID,FUN=function(x) { nas <- is.na(x); if (any(!nas) && any(nas)) x[nas] <- max(x,na.rm=T); x; }); df; };
rafa <- function(dt) dt[ , Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) ifelse( nas, max(Value, na.rm=T), Value) else Value; }, by = ID];
akrun <- function(dt) dt[, Value := { nas <- is.na(Value); if (any(!nas) && any(nas)) na.aggregate(Value, FUN = max) else Value; }, ID];
## small scale (OP's sample input)
df <- data.frame(ID=c(1L,5L,8L,6L,7L,8L,5L,1L,6L,7L,9L,8L,9L,1L,4L),Value=c(NA,15L,16L,8L,65L,NA,25L,62L,14L,NA,11L,12L,36L,26L,13L));
dt <- as.data.table(df);
ex <- as.data.frame(thierry(copy(df)));
identical(ex,bgoldst1(copy(df)));
identical(ex,bgoldst2(copy(df)));
identical(ex,as.data.frame(rafa(copy(dt))));
identical(ex,as.data.frame(akrun(copy(dt))));
microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)));
## Unit: microseconds
## expr min lq mean median uq max neval
## thierry(copy(df)) 955.804 989.1610 1043.2847 1004.984 1044.542 2852.016 100
## bgoldst1(copy(df)) 953.238 1005.1985 1069.6281 1039.410 1075.760 2968.337 100
## bgoldst2(copy(df)) 160.798 181.9665 196.0281 192.872 207.412 246.329 100
## rafa(copy(dt)) 947.679 1006.6945 1056.9396 1033.637 1055.874 2943.105 100
## akrun(copy(dt)) 1327.862 1384.5255 1496.1259 1415.530 1445.894 3969.899 100
## large scale, 3 group sizes crossed with 4 NA densities
NV <- 1e5L;
NIs <- c(10L,1e3L,3e4L);
probNAs <- c(1e-3,0.05,0.4,0.95);
res <- expand.grid(NI=NIs,probNA=probNAs);
system.time({
for (ri in seq_len(nrow(res))) {
NI <- res$NI[ri];
probNA <- res$probNA[ri];
df <- data.frame(ID=sample(seq_len(NI),NV,T),Value=sample(c(NA,1:99),NV,T,c(probNA,rep((1-probNA)/99,99L))));
dt <- as.data.table(df);
ex <- as.data.frame(thierry(copy(df)));
if (!all(c(
identical(ex,bgoldst1(copy(df))),
identical(ex,bgoldst2(copy(df))),
identical(ex,as.data.frame(rafa(copy(dt)))),
identical(ex,as.data.frame(akrun(copy(dt))))
))) stop('non-identical failure.');
bm <- summary(microbenchmark(thierry(copy(df)),bgoldst1(copy(df)),bgoldst2(copy(df)),rafa(copy(dt)),akrun(copy(dt)),times=5L));
nms <- sub('\\(.*','',as.character(bm$expr));
for (nm in nms) if (!nm%in%names(res)) res[[nm]] <- NA_real_;
if (!'unit'%in%names(res)) res$unit <- NA_character_;
res[ri,nms] <- bm$mean;
res$unit[ri] <- attr(bm,'unit');
}; ## end for
});
## user system elapsed
## 73.18 0.00 73.37
res;
## NI probNA thierry bgoldst1 bgoldst2 rafa akrun unit
## 1 10 0.001 7.850589 138.77128 14.867427 7.071150 8.023874 milliseconds
## 2 1000 0.001 40.318311 177.26223 9.868853 6.389129 18.054122 milliseconds
## 3 30000 0.001 813.204627 619.16166 125.274735 57.301590 74.732023 milliseconds
## 4 10 0.050 9.387743 139.41686 15.032158 8.479837 6.933616 milliseconds
## 5 1000 0.050 43.223697 156.79871 23.377797 20.550586 145.632279 milliseconds
## 6 30000 0.050 822.338773 677.81813 129.268155 114.585475 656.468438 milliseconds
## 7 10 0.400 15.955374 110.20717 9.785802 11.832889 10.511871 milliseconds
## 8 1000 0.400 55.858348 115.93900 14.441228 22.525058 142.740834 milliseconds
## 9 30000 0.400 853.571520 521.19690 147.925864 208.278328 2518.672465 milliseconds
## 10 10 0.950 9.768268 43.98346 5.921021 9.895623 8.571868 milliseconds
## 11 1000 0.950 49.228024 63.72596 13.702929 22.152230 143.606916 milliseconds
## 12 30000 0.950 822.033257 103.91700 113.398739 86.240922 630.982913 milliseconds
答案 1 :(得分:1)
library(dplyr)
dataset %>%
group_by(ID) %>%
mutate(
Value = ifelse(
is.na(Value),
max(Value, na.rm = TRUE),
Value
)
)
答案 2 :(得分:1)
使用data.table
的简单快速解决方案。感谢@bgoldst提供na.rm=T
的提示。
library(data.table)
setDT(df)[ , Value := ifelse( is.na(Value), max(Value, na.rm=T), Value), by = ID]
答案 3 :(得分:1)
我们可以将na.aggregate
与data.table
library(data.table)
library(zoo)
setDT(df)[, Value := na.aggregate(Value, FUN = max) , by = ID]
df
# ID Value
# 1: 1 62
# 2: 5 15
# 3: 8 16
# 4: 6 8
# 5: 7 65
# 6: 8 16
# 7: 5 25
# 8: 1 62
# 9: 6 14
#10: 7 65
#11: 9 11
#12: 8 12
#13: 9 36
#14: 1 26
#15: 4 13