Question

假设我有两个数据集df1和df2，如下所示：

# Target table: eight Ids whose pricetag starts out as the placeholder string
# "na" (a character value, not a real NA).
df1 <- data.frame(
  Id = 1:8,
  pricetag = rep("na", 8),
  stringsAsFactors = FALSE  # spelled out: the shorthand `F` can be reassigned
)
# Lookup table: prices for Ids 1-4 only.
df2 <- data.frame(
  Id = 1:4,
  price = c(10, 20, 30, 40),
  stringsAsFactors = FALSE
)

> df1
  Id pricetag
1  1       na
2  2       na
3  3       na
4  4       na
5  5       na
6  6       na
7  7       na
8  8       na
> df2
  Id price
1  1    10
2  2    20
3  3    30
4  4    40

我正在尝试使用以下代码，通过匹配Id把df2中的价格值填入df1。

# Look up each df1 Id in df2 and copy the matching price. The whole pricetag
# column is reassigned, so Ids with no match in df2 end up as NA.
df1$pricetag <- df2$price[match(df1$Id, df2$Id)]

得到的结果如下：

> df1
  Id pricetag
1  1       10
2  2       20
3  3       30
4  4       40
5  5       NA
6  6       NA
7  7       NA
8  8       NA

我有第三个数据集。我正在尝试遵循相同的步骤。

# Second lookup table: prices for the remaining Ids 5-8.
df3 <- data.frame(
  Id = 5:8,
  price = c(50, 60, 70, 80),
  stringsAsFactors = FALSE  # spelled out: the shorthand `F` can be reassigned
)
> df3
  Id price
1  5    50
2  6    60
3  7    70
4  8    80

# Same lookup, now against df3. Because the entire pricetag column is
# reassigned, rows whose Id is not in df3 get NA -- which discards the prices
# that were filled in from df2 earlier.
df1$pricetag <- df3$price[match(df1$Id, df3$Id)]

> df1
  Id pricetag
1  1       NA
2  2       NA
3  3       NA
4  4       NA
5  5       50
6  6       60
7  7       70
8  8       80

但是，这会覆盖df1中先前从df2得到的价格信息。在重复相同的过程时，有没有办法避免这种覆盖？

Answer 1

替换

df1$pricetag <- df3$price[match(df1$Id, df3$Id)]

如果您想进行 update-join（用df3中的数据覆盖df1中的对应值），请将上面这行代码替换为：

# Update-join: overwrite df1's pricetag wherever df3 has a matching Id and
# leave every other row untouched.
src_row <- match(df1$Id, df3$Id)  # row of df3 for each df1 Id; NA if absent
has_match <- !is.na(src_row)      # TRUE for df1 rows that found a partner
df1$pricetag[has_match] <- df3$price[src_row[has_match]]
rm(src_row, has_match)            # drop the temporary helpers
df1
#  Id pricetag
#1  1       10
#2  2       20
#3  3       30
#4  4       40
#5  5       50
#6  6       60
#7  7       70
#8  8       80

如果要创建 gap-fill-join （用df3中的数据填充df1中的NA），则使用：

# Gap-fill-join: only rows of df1 whose pricetag is NA are candidates; each is
# filled from df3 when df3 has the same Id, and left as NA otherwise.
gap_rows <- which(is.na(df1$pricetag))       # df1 rows still missing a price
src_rows <- match(df1$Id[gap_rows], df3$Id)  # matching df3 row, NA if none
found <- !is.na(src_rows)
df1$pricetag[gap_rows[found]] <- df3$price[src_rows[found]]
rm(gap_rows, src_rows, found)                # drop the temporary helpers
df1
#  Id pricetag
#1  1       10
#2  2       20
#3  3       30
#4  4       40
#5  5       50
#6  6       60
#7  7       70
#8  8       80

Answer 2

您可以使用is.na函数来标识要查找的行：

# Restrict the lookup to rows whose pricetag is still NA, so prices that are
# already present are never touched.
w <- which(is.na(df1$pricetag))
df3_rows <- match(df1$Id[w], df3$Id)  # NA where df3 has no such Id
df1$pricetag[w] <- df3$price[df3_rows]

  Id category pricetag
1  1       na       10
2  2       na       20
3  3       na       30
4  4       na       40
5  5       na       50
6  6       na       60
7  7       na       70
8  8       na       80

data.table软件包为此提供了一些更方便的语法：

# Start over with a df1 that carries a `category` column and no prices yet.
df1 <- data.frame(Id = 1:8, category = "na", stringsAsFactors = FALSE)

library(data.table)
# Convert the three tables to data.table in place.
setDT(df1)
setDT(df2)
setDT(df3)

# Create the target column as a numeric NA so it matches the type of `price`.
df1[, pricetag := NA_real_]

# Fill from each lookup table in turn. Only rows that are still NA are
# updated, so a price taken from an earlier table is never overwritten by a
# later one.
for (odf in list(df2, df3)) {
  df1[is.na(pricetag), pricetag := odf[.SD, on = .(Id), x.price]]
}

# Nothing is auto-printed inside a `for` loop, so display the result here.
df1[]

   Id category pricetag
1:  1       na       10
2:  2       na       20
3:  3       na       30
4:  4       na       40
5:  5       na       50
6:  6       na       60
7:  7       na       70
8:  8       na       80

这种合并称为“更新联接”。

Answer 3

为此，我制作了软件包 safejoin ：

# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(tidyverse)

df1 %>%
  # Give the column the same name it has in df2/df3.
  # NOTE(review): presumably this is what makes safejoin treat it as a
  # conflicting column -- confirm against the safejoin docs.
  rename(price = pricetag) %>%
  # Make regular numeric NAs from the "na" placeholder strings; as.numeric()
  # emits a coercion warning for them, which is expected here.
  mutate(price = as.numeric(price)) %>%
  # coalesce keeps the first non-NA value when both sides supply `price`.
  safe_left_join(df2, "Id", conflict = coalesce) %>%
  safe_left_join(df3, "Id", conflict = coalesce)

#   Id price
# 1  1    10
# 2  2    20
# 3  3    30
# 4  4    40
# 5  5    50
# 6  6    60
# 7  7    70
# 8  8    80

在r

3 个答案: