合并/加入优先处理非缺失值

时间:2016-06-08 23:23:49

标签: r join dataframe merge dplyr

是否有合并函数能够优先采用共有变量(即两个data.frame中同名的列)的非缺失值?

请考虑以下示例。

首先,我们生成两个具有相同ID但在特定变量上具有补充缺失值的data.frame:

# Fix the RNG seed so the example is reproducible.
set.seed(1)
# Draw 3 of the 6 row positions: df1 receives values at these positions,
# df2 receives values at the remaining ones.
missings  <- sample.int(6, 3)
# Two data frames with identical IDs and a V1 column that starts out all NA.
df1  <- data.frame(ID = letters[1:6], V1 = NA)
df2  <- data.frame(ID = letters[1:6], V1 = NA)
# Fill complementary positions, so each ID has a V1 value in exactly one
# of the two data frames.
df1$V1[missings]  <- rnorm(3)
df2$V1[setdiff(1:6, missings)]  <- rnorm(3)

应用merge或dplyr包中的任何join函数会产生类似于以下内容的结果:

> merge(df1, df2, by = 'ID')
  ID      V1.x       V1.y
1  a        NA -1.5399500
2  b 1.3297993         NA
3  c 0.4146414         NA
4  d        NA -0.9285670
5  e        NA -0.2947204
6  f 1.2724293         NA

我们希望以“更智能”的方式合并这两个data.frame:当某个值在一个data.frame中缺失、而在另一个data.frame中存在时,忽略缺失值并采用存在的那个值,以获得以下输出:

> output <- df1
> output$V1[is.na(df1$V1)]  <- df2$V1[!(is.na(df2$V1))]
> output
  ID         V1
1  a -1.5399500
2  b  1.3297993
3  c  0.4146414
4  d -0.9285670
5  e -0.2947204
6  f  1.2724293

我们可以假设df1和df2在V1上的缺失值完全互补。

修改

适用于任意数量变量的解决方案将是理想的。

4 个答案:

答案 0 :(得分:3)

感谢@Gregor和@StevenBeaupré的非常有用的评论,我提出了一个使用kimisc包中coalesce.na的解决方案,该解决方案可以扩展到任意数量的变量:

# Pair up the elements of df1$V1 and df2$V1 and coalesce each pair; per the
# output shown below, the result is the non-missing value of each pair.
mapply(function(x,y) coalesce.na(x,y), df1$V1, df2$V1)
[1] -1.5399500  1.3297993  0.4146414 -0.9285670 -0.2947204  1.2724293

请注意,df1$V1和df2$V1可以替换为变量列表,从而实现类似下面的做法:

> set.seed(1)
> missings  <- sample.int(6, 3)
> df1  <- data.frame(ID = letters[1:6],
+                    V1 = NA,
+                    V2 = NA)
> df2  <- data.frame(ID = letters[1:6],
+                    V1 = NA,
+                    V2 = NA)
> df1$V1[missings]  <- rnorm(3)
> df2$V1[setdiff(1:6, missings)]  <- rnorm(3)
> df1$V2[setdiff(1:6, missings)]  <- rnorm(3)
> df2$V2[missings]  <- rnorm(3)

> cbind(df1, df2)
  ID        V1           V2 ID         V1         V2
1  a        NA -0.005767173  a -1.5399500         NA
2  b 1.3297993           NA  b         NA -0.7990092
3  c 0.4146414           NA  c         NA -0.2894616
4  d        NA  2.404653389  d -0.9285670         NA
5  e        NA  0.763593461  e -0.2947204         NA
6  f 1.2724293           NA  f         NA -1.1476570

> dfMerged <- merge(df1, df2, by = 'ID')
> xList <- dfMerged[grep("\\.x$", names(dfMerged))]
> yList <- dfMerged[grep("\\.y$", names(dfMerged))]

> mapply(function(x,y) coalesce.na(x,y), xList, yList)
           V1.x         V2.x
[1,] -1.5399500 -0.005767173
[2,]  1.3297993 -0.799009249
[3,]  0.4146414 -0.289461574
[4,] -0.9285670  2.404653389
[5,] -0.2947204  0.763593461
[6,]  1.2724293 -1.147657009

完整的解决方案看起来像是:

library(kimisc)
# Merge a list of data frames on `idVar` (full outer join) and collapse every
# non-key column that appears in more than one data frame into a single
# column, filling missing values from later data frames. Earlier data frames
# in `dfList` take precedence when several hold a non-missing value.
#
# Arguments:
#   dfList: list of data frames, each containing the column(s) named by idVar.
#   idVar:  character vector with the name(s) of the key column(s), passed on
#           to merge(by = ).
#
# Returns a data frame with the key column(s) plus one column per distinct
# non-key column name. Collapsed columns keep their original name (no ".x" /
# ".y" suffix), which is what lets the next data frame in the list collide
# with them and be coalesced in turn.
smartMergeList <- function(dfList, idVar) {
    mergePair <- function(left, right) {
        # merge() appends ".x"/".y" to exactly these columns: non-key names
        # present on both sides.
        shared <- setdiff(intersect(names(left), names(right)), idVar)
        merged <- merge(left, right, by = idVar, all = TRUE)
        if (length(shared) > 0L) {
            xCols <- paste0(shared, ".x")
            yCols <- paste0(shared, ".y")
            # Map() returns one vector per column, so columns of different
            # types keep their own type (mapply() would simplify to a matrix
            # and coerce every column to a common type).
            merged[xCols] <- Map(function(a, b) coalesce.na(a, b),
                                 merged[xCols], merged[yCols])
            merged[yCols] <- NULL
            # Restore the un-suffixed names so the next merge in the Reduce()
            # chain sees these columns as shared again.
            names(merged)[match(xCols, names(merged))] <- shared
        }
        merged
    }
    Reduce(f = mergePair, x = dfList)
}

我希望看到更漂亮的东西!

答案 1 :(得分:2)

如果避免指定列是output$V1[is.na(df1$V1)] <- df2$V1[!(is.na(df2$V1))]的唯一问题,那么您只需使用na.omit()而不是单独指定变量。我们也可以仅针对重叠列执行此操作。

让我们修改原始数据,使列不是完美的重叠,额外的列有一些我们想要保留的数据:

# Fix the RNG seed so the example is reproducible.
set.seed(1)

# Draw 3 of the 6 row positions: df1 receives V1 values there, df2 receives
# V1 values at the remaining positions.
missings  <- sample.int(6, 3)
# df1 additionally carries V2, a column that does not exist in df2.
df1  <- data.frame(ID = letters[1:6], V1 = NA, V2 = c(NA, 2, 3, NA, 5, 6))
df2  <- data.frame(ID = letters[1:6], V1 = NA)
df1$V1[missings]  <- rnorm(3)
df2$V1[setdiff(1:6, missings)]  <- rnorm(3)

# now df1 looks like this:
df1
#   ID        V1 V2
# 1  a        NA NA
# 2  b 1.3297993  2
# 3  c 0.4146414  3
# 4  d        NA NA
# 5  e        NA  5
# 6  f 1.2724293  6


# Key column that identifies a row in both data frames.
id_col <- "ID"
common_cols <- intersect(names(df1), names(df2))
# Stack the shared columns of both inputs and keep only the complete rows,
# i.e. for each ID the row that actually carries the value.
result <- na.omit(rbind(df1[common_cols], df2[common_cols]))
# Pick up the columns that exist in only one input. Join on the ID alone:
# merge()'s default joins on every shared column (ID and V1 here), so an ID
# whose V1 came from the other data frame would not match and would lose its
# extra data (row "e" would end up with V2 = NA instead of 5).
extra1 <- setdiff(names(df1), common_cols)
extra2 <- setdiff(names(df2), common_cols)
result <- merge(result, df1[c(id_col, extra1)], by = id_col, all.x = TRUE)
result <- merge(result, df2[c(id_col, extra2)], by = id_col, all.x = TRUE)
  # the merges are only necessary if there are additional columns to pick up

result
#   ID         V1 V2
# 1  a -1.5399500 NA
# 2  b  1.3297993  2
# 3  c  0.4146414  3
# 4  d -0.9285670 NA
# 5  e -0.2947204  5
# 6  f  1.2724293  6

答案 2 :(得分:1)

根据上面的讨论和答案,这是我使用dplyr的方法。这不是最干净的代码,而且没错,我确实用了suppressWarnings()。

对于OP的可重复示例:

# Fix the RNG seed so the example is reproducible.
set.seed(1)
# Draw 3 of the 6 row positions: df1 receives values at these positions,
# df2 receives values at the remaining ones.
missings  <- sample.int(6, 3)
# Two data frames with identical IDs and a V1 column that starts out all NA.
df1  <- data.frame(ID = letters[1:6], V1 = NA)
df2  <- data.frame(ID = letters[1:6], V1 = NA)
# Fill complementary positions, so each ID has a V1 value in exactly one
# of the two data frames.
df1$V1[missings]  <- rnorm(3)
df2$V1[setdiff(1:6, missings)]  <- rnorm(3)

简单的解决方案:

library(dplyr)
library(reshape2)
# Row-wise coalesce: for every row of the single table passed in, return the
# first entry that can be read as a number (NA if the row has none).
# Entries that are not numeric-looking (e.g. letter IDs) are skipped because
# as.numeric() turns them into NA; the resulting coercion warnings are muted.
coalesce <- function(...) {
  first_numeric_entry <- function(row_values) {
    parsed <- suppressWarnings(as.numeric(row_values))
    usable <- which(!is.na(parsed))
    row_values[usable[1]]
  }
  apply((...), 1, first_numeric_entry)
}

# Full join on ID, then pass the whole joined table (.) to coalesce() to build
# a single V1 column, and keep only ID and that V1.
full_join(df1, df2, by = 'ID') %>% mutate(V1 = coalesce(.)) %>% select(.,ID,V1)

  ID         V1
1  a -1.5399500
2  b  1.3297993
3  c  0.4146414
4  d -0.9285670
5  e -0.2947204
6  f  1.2724293

对于具有多个(此处显示为3个)变量的一般解决方案:

# Fix the RNG seed so the example is reproducible.
set.seed(1)
# Two data frames with identical IDs and three value columns, all NA at first.
df1  <- data.frame(ID = letters[1:6], V1 = NA, V2 = NA, V3 = NA)
df2  <- data.frame(ID = letters[1:6], V1 = NA, V2 = NA, V3 = NA)
# Each sample.int() call below draws a fresh set of positions, so the rows
# filled in df2 are not guaranteed to be the complement of the rows filled in
# df1: an ID may have a value in both data frames, or in neither.
df1$V1[sample.int(6, 3)]  <- rnorm(3)
df2$V1[setdiff(1:6, sample.int(6, 3))]  <- rnorm(3)
df1$V2[sample.int(6, 3)]  <- rnorm(3)
df2$V2[setdiff(1:6, sample.int(6, 3))]  <- rnorm(3)
df1$V3[sample.int(6, 3)]  <- rnorm(3)
df2$V3[setdiff(1:6, sample.int(6, 3))]  <- rnorm(3)

相同的coalesce()函数,更精细的dplyr逻辑:

library(dplyr)
library(reshape2)
# Row-wise coalesce: for every row of the single table passed in, return the
# first entry that can be read as a number (NA if the row has none).
# Entries that are not numeric-looking (e.g. letter IDs) are skipped because
# as.numeric() turns them into NA; the resulting coercion warnings are muted.
coalesce <- function(...) {
  first_numeric_entry <- function(row_values) {
    parsed <- suppressWarnings(as.numeric(row_values))
    usable <- which(!is.na(parsed))
    row_values[usable[1]]
  }
  apply((...), 1, first_numeric_entry)
}

# Join both inputs on ID, then reshape so that the ".x" and ".y" versions of
# each variable sit side by side on one row per (ID, variable).
full_join(df1, df2, by = "ID") %>% 
  # Long format: one row per ID and joined column name.
  melt(., id.vars = "ID") %>%
  # The first two characters of the column name ("V1" from "V1.x") identify
  # the underlying variable.
  mutate(var = substr(as.character(variable),0,2)) %>%
  # NOTE(review): the next step is reshape2::dcast(), which presumably ignores
  # dplyr grouping, so this group_by() looks redundant - confirm.
  group_by(var,value) %>% 
  # Wide again: one row per (ID, var), one column per joined column name.
  dcast(.,ID + var ~ variable, value.var = "value") %>%
  # Row-wise pick of the first numeric-looking entry across the whole row.
  mutate(c = coalesce(.)) %>%
  # Final layout: one row per ID, one column per variable.
  dcast(.,ID ~ var, value.var = "c")

  ID         V1         V2          V3
1  a -1.5399500  0.3898432        <NA>
2  b -0.9285670 -0.3053884  0.80418951
3  c -0.8356286       <NA>   0.5939013
4  d  0.1836433 -0.4115108 -0.05710677
5  e       <NA>       <NA>   0.8212212
6  f -0.6264538  1.5117812   0.9189774

coalesce()函数选择第一个非NA值(如果存在)。您可以根据问题改为选择max或其他内容。注意:合并后的数据集中,ID列以及任何不应参与取值的其他列必须是非数字的,否则coalesce()会把它们当作候选值。我希望这对解决你的真正问题有所帮助。

答案 3 :(得分:0)

这可能是data.table方法:

library(data.table)
# Convert both data frames to data.tables in place.
setDT(df1); setDT(df2);

# Update join on ID: V1 in df1 is overwritten by reference, taking the value
# from df2 (referred to as i.V1) wherever df1's own V1 is NA.
df1[df2, V1 := ifelse(is.na(V1), i.V1, V1), on = "ID"]

df1
#    ID         V1
# 1:  a -1.5399500
# 2:  b  1.3297993
# 3:  c  0.4146414
# 4:  d -0.9285670
# 5:  e -0.2947204
# 6:  f  1.2724293