Question

我想基于前两列及其值合并两个数据帧，但是在不同的数据集中，这两列中的值可能互换位置。因此，merge函数或dplyr包中的left_join函数不会把这些成对信息视为相同。

为了得到更好的解释，我在此处定义了两个假设的数据集：

# First example data set: pairs of tree IDs (stored as factors) and a value
# measured for each pair.
tree.dat1 <- data.frame(
  tree1 = factor(c(rep(33, 3), rep(22, 2), 11)),
  tree2 = factor(c(22, 11, 44, 11, 44, 44)),
  value = c(0.02, rep(0.03, 3), rep(0.01, 2))
)

> tree.dat1
   tree1 tree2 value
1    33    22  0.02
2    33    11  0.03
3    33    44  0.03
4    22    11  0.03
5    22    44  0.01
6    11    44  0.01

# Second example data set: the same unordered pairs of tree IDs (stored as
# factors), but with the two IDs sometimes listed in the opposite column order.
tree.dat2 <- data.frame(
  tree1 = factor(c(rep(11, 3), rep(33, 2), 22)),
  tree2 = factor(c(22, 33, 44, 22, 44, 44)),
  # All six values are written out explicitly: 0.02, 0.03, 0.03, twice over.
  value1 = rep(c(0.02, 0.03, 0.03), times = 2)
)
> tree.dat2
  tree1 tree2 value1
1    11    22   0.02
2    11    33   0.03
3    11    44   0.03
4    33    22   0.02
5    33    44   0.03
6    22    44   0.03

如您所见，前两列的成对是相同的，但是它们的顺序不同。因此，我想通过在前两列中使用此信息并在这些数据集中保留这两列来创建新的数据集。

所以：

> tree.dat3 = left_join(tree.dat1,tree.dat2, by = c("tree1","tree2"))
> tree.dat3
   tree1 tree2 value value1
1    33    22  0.02   0.02
2    33    11  0.03     NA
3    33    44  0.03   0.03
4    22    11  0.03     NA
5    22    44  0.01   0.03
6    11    44  0.01   0.03

最后我得到了两个NA值，但是，当我查看成对表时，可以看到tree.dat2数据集中其实给出了成对33-11（或22-11）的信息，只是两列的顺序相反（即11-33和11-22）。

所以预期的输出是：

   tree1 tree2 value value1
1    33    22  0.02   0.02
2    33    11  0.03   0.03
3    33    44  0.03   0.03
4    22    11  0.03   0.02
5    22    44  0.01   0.03
6    11    44  0.01   0.03

所以我可能正在寻找其他方法来合并两个数据帧以检查成对信息，而不是检查两列中的因子水平。因为33-11和11-33相同，但是第三列中的值不同。我想知道一种适用于大型数据集的合适方法。有什么建议吗？

Answer 1

由于连接的顺序无关紧要，所以我们创建一个TreeID列，在其中对tree1和tree2进行排序，以便数字对始终以相同的顺序出现。

如果您的数据没有被编码为factor，这会更容易，因为对因子直接使用min/max是行不通的，您必须先将其强制转换为字符、再转换为数值，才能在一般情况下正常工作。如果需要对源数据执行此操作，as.numeric(as.character(tree.dat1$tree1))可以解决问题。您也可以不转换为numeric而直接取character向量的最大值，但我不建议这样做，因为字符串是按字典序比较的，例如max("11","2")会返回"2"。

library(tidyverse)
library(stringr)

# Example data: each unordered pair of tree IDs appears once in each data
# frame, but the two IDs may be stored in either column order.
# The IDs are plain numerics (not factors) so they can be compared directly.
tree.dat1 <- data.frame(
  tree1 = c(rep(33, 3), rep(22, 2), 11),
  tree2 = c(22, 11, 44, 11, 44, 44),
  value = c(0.02, rep(0.03, 3), rep(0.01, 2))
)

tree.dat2 <- data.frame(
  tree1 = c(rep(11, 3), rep(33, 2), 22),
  tree2 = c(22, 33, 44, 22, 44, 44),
  # All six values are written out explicitly: 0.02, 0.03, 0.03, twice over.
  value1 = rep(c(0.02, 0.03, 0.03), times = 2)
)

通过组合tree1和tree2中的最小值和最大值来构造TreeID。这里需要对每一行（而不是对整列）取最大值和最小值，例如可以借助rowwise()来实现。

# Build an order-independent join key: the smaller ID always comes first, so
# the pairs (33, 11) and (11, 33) both map to "11-33".
# pmin()/pmax() return the element-wise (per-row) minimum and maximum in one
# vectorised call, so no rowwise() grouping is needed.
# The "-" separator keeps keys unambiguous when IDs differ in digit count
# (without it, the pairs (1, 123) and (11, 23) would both become "1123").
tree.dat1 <- tree.dat1 %>%
  mutate(TreeID = str_c(pmin(tree1, tree2), pmax(tree1, tree2), sep = "-"))

tree.dat2 <- tree.dat2 %>%
  mutate(TreeID = str_c(pmin(tree1, tree2), pmax(tree1, tree2), sep = "-"))

# Columns present in both inputs (tree1, tree2) get .x / .y suffixes.
left_join(tree.dat1, tree.dat2, by = "TreeID")


Source: local data frame [6 x 7]
Groups: <by row>

# A tibble: 6 x 7
  tree1.x tree2.x value TreeID tree1.y tree2.y value1
    <dbl>   <dbl> <dbl> <chr>    <dbl>   <dbl>  <dbl>
1      33      22  0.02 2233        33      22   0.02
2      33      11  0.03 1133        11      33   0.03
3      33      44  0.03 3344        33      44   0.03
4      22      11  0.03 1122        11      22   0.02
5      22      44  0.01 2244        22      44   0.03
6      11      44  0.01 1144        11      44   0.03

要完全匹配您所需的输出：

# Join on the pair key, then keep only the left table's ID columns (renamed
# back to tree1 / tree2) together with the two value columns.
left_join(tree.dat1, tree.dat2, by = "TreeID") %>%
  select(tree1 = tree1.x, tree2 = tree2.x, value, value1)

  tree1 tree2 value value1
  <dbl> <dbl> <dbl>  <dbl>
1    33    22  0.02   0.02
2    33    11  0.03   0.03
3    33    44  0.03   0.03
4    22    11  0.03   0.02
5    22    44  0.01   0.03
6    11    44  0.01   0.03

Answer 2

这是一个基于base R的解决方案。因子（factor）可能会使您的问题复杂化；如果必须使用因子，可以在处理过程中先将其转换为字符。

# Example data with numeric (non-factor) tree IDs.
tree.dat1 <- data.frame(
  tree1 = c(rep(33, 3), rep(22, 2), 11),
  tree2 = c(22, 11, 44, 11, 44, 44),
  value = c(0.02, rep(0.03, 3), rep(0.01, 2))
)

tree.dat2 <- data.frame(
  tree1 = c(rep(11, 3), rep(33, 2), 22),
  tree2 = c(22, 33, 44, 22, 44, 44),
  # All six values are written out explicitly: 0.02, 0.03, 0.03, twice over.
  value1 = rep(c(0.02, 0.03, 0.03), times = 2)
)

# Order-independent pair key "<smaller>-<larger>", e.g. (33, 11) -> "11-33".
# pmin()/pmax() work element-wise on whole columns, so no per-row apply() call
# is needed.
tree.dat1$id <- paste(
  pmin(tree.dat1$tree1, tree.dat1$tree2),
  pmax(tree.dat1$tree1, tree.dat1$tree2),
  sep = "-"
)
tree.dat2$id <- paste(
  pmin(tree.dat2$tree1, tree.dat2$tree2),
  pmax(tree.dat2$tree1, tree.dat2$tree2),
  sep = "-"
)

# Left join in base R that keeps the row order of tree.dat1: match() gives,
# for each key in tree.dat1, the position of its first occurrence in
# tree.dat2 (NA when the pair is absent, which yields an NA value1).
tree.dat3 <- tree.dat1[, c("tree1", "tree2", "value")]
tree.dat3$value1 <- tree.dat2$value1[match(tree.dat1$id, tree.dat2$id)]


> tree.dat3
  tree1 tree2 value value1
1    33    22  0.02   0.02
2    33    11  0.03   0.03
3    33    44  0.03   0.03
4    22    11  0.03   0.02
5    22    44  0.01   0.03
6    11    44  0.01   0.03

根据前两列合并两个数据帧，并在R

2 个答案: