问题:
如何使用R的tidyverse和dplyr,通过交叉检查2x2列来高效地联接两个表?我是R的新手,在以往的问题和讨论中都没有找到针对这一问题的解答。
我有两个表,它们的行数和列数各不相同。每个表都包含A列和B列。这些列中的字符串可能相同,也可能唯一;同一个值可能出现在任一列中,也可能缺失。基本上,我需要先将A1列同时与A2和B2进行比对,然后再将B1列同时与A2和B2进行比对。
说明概念的示例:
df1
ID      pISSN      eISSN      Level
437097  NA         1530-9932  1
489309  2366-004X  2366-0058  1
437103  0025-5858  NA         1
437109  1042-9670  1545-7230  1
449363  1093-1139  NA         0
437127  NA         0949-1775  1
437124  0361-3682  1873-6289  2
481203  0103-846X  0103-846X  1
479825  2153-2184  2153-2192  0
437136  0734-2071  1557-7333  2
df2
ID     pISSN      eISSN      Format
41120  NA         2364-9534  E OA S C
12249  NA         1530-9932  E OF S
261    NA         2366-0058  E OF S
12188  0025-5858  1865-8784  PE OF S
40596  1042-9670  1545-7230  PE OF S
12129  0895-4852  1936-4709  PE OF
769    0949-1775  1432-0517  PE OF S
result
ID pISSN eISSN Level Format
437097 1530-9932 1 E OF S
489309 2366-004X 2366-0058 1 E OF S
437103 0025-5858 1865-8784 1 PE OF S
437109 1042-9670 1545-7230 1 PE OF S
437127 0949-1775 1 PE OF S
输入示例表:
dput(df1, file = "")
structure(list(ID = c(437097, 489309, 437103, 437109, 449363, 437127, 437124, 481203, 479825, 437136), pISSN = c(NA, "2366-004X", "0025-5858", "1042-9670", "1093-1139", NA, "0361-3682", "0103-846X", "2153-2184", "0734-2071"), eISSN = c("1530-9932", "2366-0058", NA, "1545-7230", NA, "0949-1775", "1873-6289", "0103-846X", "2153-2192", "1557-7333"), Level = c(1, 1, 1, 1, 0, 1, 2, 1, 0, 2)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
dput(df2, file = "")
structure(list(ID = c(41120, 12249, 261, 12188, 40596, 12129, 769), pISSN = c(NA, NA, NA, "0025-5858", "1042-9670", "0895-4852", "0949-1775"), eISSN = c("2364-9534", "1530-9932", "2366-0058", "1865-8784", "1545-7230", "1936-4709", "1432-0517"), Format = c("E OA S C", "E OF S", "E OF S", "PE OF S", "PE OF S", "PE OF", "PE OF S")), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
答案 0 :(得分:1)
我对您的示例代码以及与dput共享的代码有些困惑,因为我不确定它们之间的关系...但这是我对您的问题的看法:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(stringr)
# Rebuild the two example tables from the dput() output given in the question.
# df1: 10 rows with columns ID, pISSN, eISSN, Level.
df1 <- structure(list(ID = c(437097, 489309, 437103, 437109, 449363, 437127, 437124, 481203, 479825, 437136), pISSN = c(NA, "2366-004X", "0025-5858", "1042-9670", "1093-1139", NA, "0361-3682", "0103-846X", "2153-2184", "0734-2071"), eISSN = c("1530-9932", "2366-0058", NA, "1545-7230", NA, "0949-1775", "1873-6289", "0103-846X", "2153-2192", "1557-7333"), Level = c(1, 1, 1, 1, 0, 1, 2, 1, 0, 2)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
# df2: 7 rows with columns ID, pISSN, eISSN, Format.
df2 <- structure(list(ID = c(41120, 12249, 261, 12188, 40596, 12129, 769), pISSN = c(NA, NA, NA, "0025-5858", "1042-9670", "0895-4852", "0949-1775"), eISSN = c("2364-9534", "1530-9932", "2366-0058", "1865-8784", "1545-7230", "1936-4709", "1432-0517"), Format = c("E OA S C", "E OF S", "E OF S", "PE OF S", "PE OF S", "PE OF", "PE OF S")), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
# Build an order-independent join key from a pair of ISSN columns.
#
# For every position i, the non-missing values of x[i] and y[i] are sorted
# (sort() drops NA by default) and pasted together, so the pairs ("a", "b")
# and ("b", "a") yield the same key and a missing value contributes nothing.
#
# x, y: character vectors (recycled against each other by mapply()).
# Returns an unnamed character vector with one key per element. Keys are
# deliberately returned without names: Vectorize()/mapply() would otherwise
# label each key with the corresponding value of `x`, and those differing
# names on the two tables made full_join() warn that `join_key` has
# different attributes on the LHS and RHS of the join.
surrogate_key <- function(x, y) {
  keys <- mapply(
    function(a, b) str_c(sort(c(a, b)), collapse = ""),
    x, y,
    SIMPLIFY = FALSE, USE.NAMES = FALSE
  )
  # vapply() guarantees a character vector even for zero-length input.
  vapply(keys, identity, character(1))
}

# Attach the surrogate key to both tables, join on it, then drop the helper
# column again.
df3 <- df1 %>% mutate(join_key = surrogate_key(pISSN, eISSN))
df4 <- df2 %>% mutate(join_key = surrogate_key(pISSN, eISSN))
result <- full_join(df3, df4, by = "join_key") %>%
  select(-join_key)
# (No attribute warning is emitted here: both join_key columns are plain,
# unnamed character vectors.)
result
#> # A tibble: 15 x 8
#> ID.x pISSN.x eISSN.x Level ID.y pISSN.y eISSN.y Format
#> <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
#> 1 437097 <NA> 1530-9932 1 12249 <NA> 1530-9932 E OF S
#> 2 489309 2366-004X 2366-0058 1 NA <NA> <NA> <NA>
#> 3 437103 0025-5858 <NA> 1 NA <NA> <NA> <NA>
#> 4 437109 1042-9670 1545-7230 1 40596 1042-9670 1545-7230 PE OF S
#> 5 449363 1093-1139 <NA> 0 NA <NA> <NA> <NA>
#> 6 437127 <NA> 0949-1775 1 NA <NA> <NA> <NA>
#> 7 437124 0361-3682 1873-6289 2 NA <NA> <NA> <NA>
#> 8 481203 0103-846X 0103-846X 1 NA <NA> <NA> <NA>
#> 9 479825 2153-2184 2153-2192 0 NA <NA> <NA> <NA>
#> 10 437136 0734-2071 1557-7333 2 NA <NA> <NA> <NA>
#> 11 NA <NA> <NA> NA 41120 <NA> 2364-9534 E OA S C
#> 12 NA <NA> <NA> NA 261 <NA> 2366-0058 E OF S
#> 13 NA <NA> <NA> NA 12188 0025-5858 1865-8784 PE OF S
#> 14 NA <NA> <NA> NA 12129 0895-4852 1936-4709 PE OF
#> 15 NA <NA> <NA> NA 769 0949-1775 1432-0517 PE OF S
答案 1 :(得分:1)
我想我现在了解您要实现的目标。
# Step 1
library(magrittr)
suppressMessages(library(dplyr))
library(fuzzyjoin)
# Step 2: rebuild the two example tables from the dput() output given in the
# question (df1: ID, pISSN, eISSN, Level; df2: ID, pISSN, eISSN, Format).
df1 <- structure(list(ID = c(437097, 489309, 437103, 437109, 449363, 437127, 437124, 481203, 479825, 437136), pISSN = c(NA, "2366-004X", "0025-5858", "1042-9670", "1093-1139", NA, "0361-3682", "0103-846X", "2153-2184", "0734-2071"), eISSN = c("1530-9932", "2366-0058", NA, "1545-7230", NA, "0949-1775", "1873-6289", "0103-846X", "2153-2192", "1557-7333"), Level = c(1, 1, 1, 1, 0, 1, 2, 1, 0, 2)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
df2 <- structure(list(ID = c(41120, 12249, 261, 12188, 40596, 12129, 769), pISSN = c(NA, NA, NA, "0025-5858", "1042-9670", "0895-4852", "0949-1775"), eISSN = c("2364-9534", "1530-9932", "2366-0058", "1865-8784", "1545-7230", "1936-4709", "1432-0517"), Format = c("E OA S C", "E OF S", "E OF S", "PE OF S", "PE OF S", "PE OF", "PE OF S")), row.names = c(NA, -7L), class = c("tbl_df", "tbl", "data.frame"))
# Step 3
# NA-safe equality predicate used as the match function for the fuzzy joins.
#
# key1, key2: vectors compared element-wise with `==`.
# Returns a logical vector that is TRUE only where both keys are present and
# equal; comparisons involving NA yield FALSE instead of NA.
my_match <- function(key1, key2) {
  is_equal <- key1 == key2
  # NA & FALSE is FALSE, so this turns every NA comparison into "no match".
  is_equal & !is.na(is_equal)
}
# Step 4
# Step 4: cross-check the 2x2 key columns. Each fuzzy_inner_join() pairs one
# ISSN column of df1 with one ISSN column of df2, using my_match so that a
# missing key never counts as a match. The four join results are stacked
# into a single data frame.
result <- bind_rows(
  fuzzy_inner_join(df1, df2,
    by = c("pISSN" = "pISSN"),
    match_fun = list(my_match)
  ),
  fuzzy_inner_join(df1, df2,
    by = c("eISSN" = "pISSN"),
    match_fun = list(my_match)
  ),
  fuzzy_inner_join(df1, df2,
    by = c("pISSN" = "eISSN"),
    match_fun = list(my_match)
  ),
  fuzzy_inner_join(df1, df2,
    by = c("eISSN" = "eISSN"),
    match_fun = list(my_match)
  )
) %>%
  # A pair of rows that agrees on more than one key combination (here
  # 437109 / 40596 matches on both pISSN and eISSN) is returned by several
  # joins; keep each matched pair only once.
  distinct() %>%
  # Step 5: merge the .x (df1) and .y (df2) key columns, preferring the df1
  # value and falling back to df2 when it is missing, then keep only the
  # columns of interest.
  mutate(
    pISSN = coalesce(pISSN.x, pISSN.y),
    eISSN = coalesce(eISSN.x, eISSN.y)
  ) %>%
  select("ID.x", "ID.y", "pISSN", "eISSN", "Level", "Format")
result
#> # A tibble: 5 x 6
#> ID.x ID.y pISSN eISSN Level Format
#> <dbl> <dbl> <chr> <chr> <dbl> <chr>
#> 1 437103 12188 0025-5858 1865-8784 1 PE OF S
#> 2 437109 40596 1042-9670 1545-7230 1 PE OF S
#> 3 437127 769 0949-1775 0949-1775 1 PE OF S
#> 4 437097 12249 <NA> 1530-9932 1 E OF S
#> 5 489309 261 2366-004X 2366-0058 1 E OF S
代码各步骤说明:
1. 加载一些必需的软件包,即:`magrittr`,用于 `%>%`;`dplyr`,用于 `bind_rows`(类似于 `rbind`)、`mutate` 和 `select`;`fuzzyjoin`,用于 `fuzzy_inner_join`。
2. 创建示例数据 `df1` 和 `df2`。
3. 定义匹配函数 `my_match`。该函数基于相等性进行匹配,但是当涉及 `NA` 时,它将返回 `FALSE`(不匹配)而不是 `NA`。
4. 使用 `fuzzy_inner_join` 通过以下键进行四个联接:(i) `df1` 的 "pISSN" 和 `df2` 的 "pISSN";(ii) `df1` 的 "eISSN" 和 `df2` 的 "pISSN";(iii) `df1` 的 "pISSN" 和 `df2` 的 "eISSN";(iv) `df1` 的 "eISSN" 和 `df2` 的 "eISSN"。这就是我们所谓的交叉检查2x2列的部分。然后,我们将这四个结果数据帧包装在 `bind_rows` 中,以将所有观察值(行)存储在一个数据帧中。
5. (i) 使用 `mutate` 创建两个新列 `pISSN` 和 `eISSN`:`pISSN` 来自列 `pISSN.x`(最初来自 `df1`)和 `pISSN.y`(最初来自 `df2`),`eISSN` 来自列 `eISSN.x`(来自 `df1`)和 `eISSN.y`(来自 `df2`);(ii) 我们使用 `select` 保留/丢弃列。
注意:与您预期的 `result` 相反,我输出了两个 `ID` 列,一列来自 `df1`,另一列来自 `df2`。在您的帖子中,您仅保留了数据帧 `df1` 中的 `ID`。但是保留哪一个模棱两可,所以我保留了这两个。您随时可以使用 `select(-ID.x)` 或 `select(-ID.y)` 丢弃其中之一。