是否有一种合并函数,能够优先采用共有变量中的非缺失值?
请考虑以下示例。
首先,我们生成两个具有相同ID、但在特定变量上缺失值互补的data.frame:
# Fix the RNG seed so the example is reproducible.
set.seed(1)
# Draw 3 of the 6 row positions; these are the rows that get values in df1.
missings <- sample.int(6, 3)
# Both data frames share the same IDs and start with V1 entirely NA.
df1 <- data.frame(ID = letters[1:6], V1 = NA)
df2 <- data.frame(ID = letters[1:6], V1 = NA)
# Fill V1 at the sampled rows in df1 and at the remaining rows in df2,
# so every ID has a value in exactly one of the two data frames.
df1$V1[missings] <- rnorm(3)
df2$V1[setdiff(1:6, missings)] <- rnorm(3)
应用merge或dplyr包中的任何join函数,都会产生类似于以下内容的结果:
> merge(df1, df2, by = 'ID')
ID V1.x V1.y
1 a NA -1.5399500
2 b 1.3297993 NA
3 c 0.4146414 NA
4 d NA -0.9285670
5 e NA -0.2947204
6 f 1.2724293 NA
我们希望以“更智能”的方式连接这两个data.frame:当某个值在一个data.frame中缺失、而在另一个data.frame中未缺失时,忽略该缺失值,以获得以下输出:
> output <- df1
> output$V1[is.na(df1$V1)] <- df2$V1[!(is.na(df2$V1))]
> output
ID V1
1 a -1.5399500
2 b 1.3297993
3 c 0.4146414
4 d -0.9285670
5 e -0.2947204
6 f 1.2724293
我们可以假设df1
和df2
具有完全互补的V1
缺失值。
适用于任意数量变量的解决方案将是理想的。
答案 0 :(得分:3)
感谢@Gregor和@StevenBeaupré的非常有用的评论,我提出了一个使用kimisc包中的coalesce.na的解决方案,该解决方案可以扩展到任意数量的变量:
mapply(function(x,y) coalesce.na(x,y), df1$V1, df2$V1)
[1] -1.5399500 1.3297993 0.4146414 -0.9285670 -0.2947204 1.2724293
请注意,df1$V1和df2$V1可以替换为变量列表,从而允许类似以下的做法:
> set.seed(1)
> missings <- sample.int(6, 3)
> df1 <- data.frame(ID = letters[1:6],
+ V1 = NA,
+ V2 = NA)
> df2 <- data.frame(ID = letters[1:6],
+ V1 = NA,
+ V2 = NA)
> df1$V1[missings] <- rnorm(3)
> df2$V1[setdiff(1:6, missings)] <- rnorm(3)
> df1$V2[setdiff(1:6, missings)] <- rnorm(3)
> df2$V2[missings] <- rnorm(3)
> cbind(df1, df2)
ID V1 V2 ID V1 V2
1 a NA -0.005767173 a -1.5399500 NA
2 b 1.3297993 NA b NA -0.7990092
3 c 0.4146414 NA c NA -0.2894616
4 d NA 2.404653389 d -0.9285670 NA
5 e NA 0.763593461 e -0.2947204 NA
6 f 1.2724293 NA f NA -1.1476570
> dfMerged <- merge(df1, df2, by = 'ID')
> xList <- dfMerged[grep("\\.x$", names(dfMerged))]
> yList <- dfMerged[grep("\\.y$", names(dfMerged))]
> mapply(function(x,y) coalesce.na(x,y), xList, yList)
V1.x V2.x
[1,] -1.5399500 -0.005767173
[2,] 1.3297993 -0.799009249
[3,] 0.4146414 -0.289461574
[4,] -0.9285670 2.404653389
[5,] -0.2947204 0.763593461
[6,] 1.2724293 -1.147657009
完整的解决方案看起来像是:
library(kimisc)
# Merge a list of data frames on `idVar`, coalescing columns they share.
#
# The data frames are merged pairwise, left to right, with a full outer join.
# Whenever a non-ID column exists in both inputs, the two copies are collapsed
# into a single column that keeps the value from the left input and falls
# back to the right input where the left one is NA. The collapsed column gets
# its original name back (no ".x"/".y" suffix), which is what lets the next
# pairwise merge coalesce against it again.
#
# Args:
#   dfList: list of data frames to merge.
#   idVar:  character vector naming the key column(s) passed to merge(by = ).
#
# Returns:
#   A single data frame with the key column(s) and one column per distinct
#   variable name found in `dfList`.
smartMergeList <- function(dfList, idVar) {
  # Merge two data frames and collapse every ".x"/".y" column pair.
  mergePair <- function(x, y) {
    merged <- merge(x, y, by = idVar, all = TRUE)
    xNames <- grep("\\.x$", names(merged), value = TRUE)
    for (xName in xNames) {
      baseName <- sub("\\.x$", "", xName)
      yName <- paste0(baseName, ".y")
      # Pair columns by name rather than by position; skip a column that
      # merely happens to end in ".x" without a ".y" counterpart.
      if (!yName %in% names(merged)) {
        next
      }
      xCol <- merged[[xName]]
      yCol <- merged[[yName]]
      # Column-wise replacement keeps each column's own type instead of
      # coercing all coalesced columns to a common matrix type.
      missingInX <- is.na(xCol)
      xCol[missingInX] <- yCol[missingInX]
      merged[[xName]] <- xCol
      merged[[yName]] <- NULL
      names(merged)[names(merged) == xName] <- baseName
    }
    merged
  }
  Reduce(mergePair, dfList)
}
我希望看到更漂亮的东西!
答案 1 :(得分:2)
如果output$V1[is.na(df1$V1)] <- df2$V1[!(is.na(df2$V1))]的唯一问题在于需要逐个指定列,那么您只需使用na.omit(),而不必单独指定变量。我们也可以仅针对重叠列执行此操作。
让我们修改原始数据,使列不是完美的重叠,额外的列有一些我们想要保留的数据:
# Build the example data: df1 carries an extra column V2 that df2 lacks.
set.seed(1)
missings <- sample.int(6, 3)
df1 <- data.frame(ID = letters[1:6], V1 = NA, V2 = c(NA, 2, 3, NA, 5, 6))
df2 <- data.frame(ID = letters[1:6], V1 = NA)
df1$V1[missings] <- rnorm(3)
df2$V1[setdiff(1:6, missings)] <- rnorm(3)
# now df1 looks like this:
df1
# ID V1 V2
# 1 a NA NA
# 2 b 1.3297993 2
# 3 c 0.4146414 3
# 4 d NA NA
# 5 e NA 5
# 6 f 1.2724293 6
# Columns present in both data frames (ID and V1 here).
common_cols <- intersect(names(df1), names(df2))
# Stack the shared columns and drop every row that still contains an NA,
# leaving one complete row per ID when the missingness is complementary.
result <- na.omit(rbind(df1[common_cols], df2[common_cols]))
# merge() without `by` joins on all shared column names (ID and V1 here), so
# an extra column such as V2 is only picked up for rows whose V1 value came
# from that data frame.
result <- merge(result, df1, all.x = TRUE)
result <- merge(result, df2, all.x = TRUE)
# the merges are only necessary if there are additional columns to pick up
result
# ID V1 V2
# 1 a -1.5399500 NA
# 2 b 1.3297993 2
# 3 c 0.4146414 3
# 4 d -0.9285670 NA
# 5 e -0.2947204 NA
# 6 f 1.2724293 6
答案 2 :(得分:1)
根据上面的讨论和答案,这是我使用dplyr的方法。代码不算最干净,而且确实用到了suppressWarnings()。
对于OP的可重复示例:
# Recreate the question's example data with a fixed seed.
set.seed(1)
# Row positions that receive values in df1; df2 is filled at the other rows.
missings <- sample.int(6, 3)
df1 <- data.frame(ID = letters[1:6], V1 = NA)
df2 <- data.frame(ID = letters[1:6], V1 = NA)
# Complementary fill: each ID ends up with V1 in exactly one data frame.
df1$V1[missings] <- rnorm(3)
df2$V1[setdiff(1:6, missings)] <- rnorm(3)
简单的解决方案:
library(dplyr)
library(reshape2)
# Row-wise pick of the first value that can be read as a number.
#
# Args:
#   ...: a single data frame or matrix; each row is scanned left to right.
#
# Returns:
#   One element per row: the first entry for which as.numeric() is not NA,
#   or NA when the row has no such entry.
#
# NOTE: defining this after library(dplyr) shadows dplyr's own coalesce().
coalesce <- function(...) {
  pick_first_numeric <- function(row_values) {
    # Conversion warnings for non-numeric entries (e.g. the ID) are expected.
    as_numbers <- suppressWarnings(as.numeric(row_values))
    numeric_positions <- which(!is.na(as_numbers))
    row_values[numeric_positions[1]]
  }
  apply((...), 1, pick_first_numeric)
}
full_join(df1, df2, by = 'ID') %>% mutate(V1 = coalesce(.)) %>% select(.,ID,V1)
ID V1
1 a -1.5399500
2 b 1.3297993
3 c 0.4146414
4 d -0.9285670
5 e -0.2947204
6 f 1.2724293
对于具有多个(此处显示为3个)变量的一般解决方案:
# Example data with three value columns, all starting as NA.
set.seed(1)
df1 <- data.frame(ID = letters[1:6], V1 = NA, V2 = NA, V3 = NA)
df2 <- data.frame(ID = letters[1:6], V1 = NA, V2 = NA, V3 = NA)
# NOTE(review): every sample.int() call below draws a fresh sample, so the
# rows filled in df2 are not guaranteed to be the complement of the rows
# filled in df1; some IDs can be filled in both or in neither.
df1$V1[sample.int(6, 3)] <- rnorm(3)
df2$V1[setdiff(1:6, sample.int(6, 3))] <- rnorm(3)
df1$V2[sample.int(6, 3)] <- rnorm(3)
df2$V2[setdiff(1:6, sample.int(6, 3))] <- rnorm(3)
df1$V3[sample.int(6, 3)] <- rnorm(3)
df2$V3[setdiff(1:6, sample.int(6, 3))] <- rnorm(3)
相同的coalesce()
函数,更精细的dplyr
逻辑:
library(dplyr)
library(reshape2)
# For each row of the single data frame (or matrix) passed in, return the
# first entry that converts to a number; rows without one yield NA.
coalesce <- function(...) {
  first_convertible <- function(entries) {
    # Non-numeric entries become NA here; their coercion warnings are muted.
    is_number <- !is.na(suppressWarnings(as.numeric(entries)))
    # match() gives the index of the first TRUE, or NA if there is none.
    entries[match(TRUE, is_number)]
  }
  apply((...), 1, first_convertible)
}
# Join both data frames on ID; columns present in both are presumably
# returned with .x/.y suffixes (V1.x, V1.y, ...) -- confirm against dplyr.
full_join(df1, df2, by = "ID") %>%
# Long format: one row per (ID, variable) with its value.
melt(., id.vars = "ID") %>%
# var = first two characters of the column name (e.g. "V1"), i.e. the name
# without the join suffix; this only works for two-character base names.
mutate(var = substr(as.character(variable),0,2)) %>%
# NOTE(review): this grouping does not appear to be used by the dcast()
# that follows -- confirm whether it is needed.
group_by(var,value) %>%
# One row per (ID, var), one column per original (suffixed) variable.
dcast(.,ID + var ~ variable, value.var = "value") %>%
# c = first entry in the row that converts to a number (see coalesce()).
mutate(c = coalesce(.)) %>%
# Back to one row per ID with one coalesced column per base variable.
dcast(.,ID ~ var, value.var = "c")
ID V1 V2 V3
1 a -1.5399500 0.3898432 <NA>
2 b -0.9285670 -0.3053884 0.80418951
3 c -0.8356286 <NA> 0.5939013
4 d 0.1836433 -0.4115108 -0.05710677
5 e <NA> <NA> 0.8212212
6 f -0.6264538 1.5117812 0.9189774
coalesce()函数选择每行中第一个可转换为数字的非NA值(如果存在)。您可以根据具体问题改用max或其他规则。由于该函数按“能否转换为数字”来挑选值,合并后的数据集中的ID列以及任何其他不参与取值的列都必须是非数字的,否则它们会被当作候选值选中。我希望这对解决你的真正问题有所帮助。
答案 3 :(得分:0)
这可能是data.table
方法:
library(data.table)
# Convert both data frames to data.tables.
setDT(df1); setDT(df2);
# Join df2 onto df1 by ID and assign V1 in df1 with `:=`: keep df1's own V1
# where it is not NA, otherwise take df2's value (referenced as i.V1).
df1[df2, V1 := ifelse(is.na(V1), i.V1, V1), on = "ID"]
# Print the updated df1.
df1
# ID V1
# 1: a -1.5399500
# 2: b 1.3297993
# 3: c 0.4146414
# 4: d -0.9285670
# 5: e -0.2947204
# 6: f 1.2724293