我想找出这两个表之间重叠的对:
> dput(data1)
structure(list(Name.x = c("MDH1", "MDH1", "IDH2", "IDH2", "IDH2",
"IDH2", "IDH2", "IDH2", "IDH2", "SCOALB", "SCOALB", "CSY4", "CSY4",
"CSY4", "CSY4", "CSY4", "FUM1", "FUM1", "IDH6", "IDH6", "IDH6",
"ODC1-1", "ODC1-1", "ODC1-1", "ODC1-1", "ODC1-1", "ODC2-1", "ODC2-1",
"ODC2-1", "ACO2", "IDH1", "IDH1", "IDH1", "IDH1", "ODC2-2"),
Name.y = c("SCOALB", "SCOALA-1", "CSY4", "IDH6", "ODC1-1",
"ODC2-1", "IDH1", "ODC2-2", "ODC1-2", "SCOALA-1", "SCOALA-2",
"IDH6", "SDH2-1", "IDH1", "IDH5", "ICDH", "ODC1-1", "ODC1-2",
"ACO2", "IDH1", "IDH5", "ODC2-1", "IDH1", "IDH5", "ODC2-2",
"ODC1-2", "IDH1", "ODC2-2", "ODC1-2", "IDH1", "IDH5", "SCOALA-2",
"ODC2-2", "ODC1-2", "ODC1-2")), .Names = c("Name.x", "Name.y"
), class = "data.frame", row.names = c(NA, -35L))
> dput(data2)
structure(list(Protein1 = structure(c(3L, 7L, 18L, 19L, 7L, 19L,
6L, 18L, 6L, 18L, 18L, 19L, 9L, 8L, 19L, 18L, 9L, 7L, 18L, 12L,
8L, 19L, 5L, 29L, 12L, 29L, 12L, 18L, 7L, 17L, 6L, 5L, 9L, 19L,
12L, 3L, 19L, 16L, 18L, 17L, 16L, 17L, 9L, 29L, 12L, 7L, 29L,
18L, 16L, 18L, 29L, 8L, 17L, 16L, 17L, 12L, 6L, 8L, 17L, 29L,
9L, 17L, 29L, 19L, 8L, 17L, 29L, 9L, 9L, 16L, 29L, 29L, 19L,
19L, 19L, 29L, 12L, 19L, 17L, 29L, 17L, 16L, 16L, 19L, 16L, 4L,
1L, 5L, 17L, 9L, 18L, 18L, 6L, 4L, 8L, 16L, 16L, 29L, 7L, 12L,
8L, 4L, 29L, 12L, 5L), .Label = c("ACO2", "ACO3", "CSY4", "FUM1",
"ICDH", "IDH1", "IDH2", "IDH5", "IDH6", "LPD1", "LPD2", "MDH1",
"MDH2", "ME1", "ME2", "ODC1-1", "ODC1-2", "ODC2-1", "ODC2-2",
"PDC1a-1", "PDC1a-2", "PDC1b", "PDC2-1", "PDC2-2", "SCoALa-1",
"SCoALa-2", "SCoALb", "SDH1-1", "SDH2-1", "SDH2-2", "SDH2-3",
"SDH3-1", "SDH4", "SDH5", "SDH6", "SDH7a", "SDH7b", "SDH8"), class = "factor"),
Protein2 = structure(c(1L, 6L, 7L, 17L, 1L, 16L, 3L, 9L,
1L, 5L, 17L, 9L, 8L, 7L, 18L, 18L, 5L, 3L, 16L, 3L, 5L, 8L,
4L, 7L, 5L, 3L, 6L, 6L, 5L, 3L, 5L, 3L, 3L, 6L, 7L, 3L, 7L,
9L, 1L, 8L, 5L, 16L, 7L, 6L, 4L, 7L, 4L, 3L, 3L, 12L, 1L,
1L, 9L, 7L, 7L, 9L, 6L, 6L, 5L, 8L, 1L, 17L, 29L, 3L, 8L,
6L, 9L, 9L, 6L, 12L, 5L, 19L, 12L, 5L, 1L, 16L, 1L, 19L,
4L, 18L, 12L, 1L, 4L, 4L, 6L, 3L, 1L, 1L, 1L, 4L, 4L, 8L,
4L, 1L, 3L, 8L, 16L, 12L, 4L, 12L, 4L, 4L, 17L, 8L, 5L), .Label = c("ACO2",
"ACO3", "CSY4", "FUM1", "ICDH", "IDH1", "IDH2", "IDH5", "IDH6",
"LPD1", "LPD2", "MDH1", "MDH2", "ME1", "ME2", "ODC1-1", "ODC1-2",
"ODC2-1", "ODC2-2", "PDC1a-1", "PDC1a-2", "PDC1b", "PDC2-1",
"PDC2-2", "SCoALa-1", "SCoALa-2", "SCoALb", "SDH1-1", "SDH2-1",
"SDH2-2", "SDH2-3", "SDH3-1", "SDH4", "SDH5", "SDH6", "SDH7a",
"SDH7b", "SDH8"), class = "factor")), .Names = c("Protein1",
"Protein2"), class = "data.frame", row.names = c(1L, 4L, 6L,
12L, 22L, 25L, 28L, 33L, 44L, 48L, 51L, 52L, 53L, 60L, 68L, 70L,
72L, 76L, 86L, 109L, 110L, 119L, 133L, 144L, 146L, 158L, 170L,
197L, 202L, 206L, 211L, 213L, 226L, 227L, 237L, 271L, 272L, 286L,
290L, 297L, 304L, 305L, 306L, 319L, 323L, 327L, 347L, 348L, 351L,
357L, 370L, 372L, 373L, 378L, 379L, 392L, 406L, 410L, 414L, 417L,
419L, 437L, 442L, 445L, 448L, 455L, 457L, 462L, 471L, 479L, 482L,
483L, 488L, 503L, 509L, 522L, 536L, 563L, 618L, 620L, 623L, 628L,
630L, 644L, 647L, 666L, 668L, 673L, 676L, 678L, 679L, 690L, 691L,
694L, 698L, 703L, 709L, 714L, 715L, 722L, 723L, 724L, 727L, 739L,
740L))
每个数据框(df)中都有两列字符串,两个表中的字符串存在重叠。不过,一对字符串在两个表中的顺序可能不同:同一对中的某个字符串可能出现在 data1 的第一列,却出现在 data2 的第二列。如何找出哪些对是两个数据集共有的,以及共有的对有多少?
答案 0 :(得分:5)
> # Build one key per row by joining the two columns with "." (the default
> # separator of interaction()). The key is order-sensitive: "A.B" and "B.A"
> # are different keys, so only pairs stored in the same column order in both
> # tables are found. Matching is exact, so names that differ only in case
> # (e.g. "SCOALB" vs "SCoALb") are not treated as equal.
> data1$combine <- as.character(interaction(data1$Name.x, data1$Name.y))
> data2$combine <- as.character(interaction(data2$Protein1, data2$Protein2))
>
> # Keep the data1 rows whose key also occurs in data2. The row mask must have
> # one entry per row of data1, so data1's keys are looked up in data2 (not
> # the other way round).
> dat.overlap <- data1[data1$combine %in% data2$combine, ]
> dat.overlap
Name.x Name.y combine
3 IDH2 CSY4 IDH2.CSY4
7 IDH2 IDH1 IDH2.IDH1
19 IDH6 ACO2 IDH6.ACO2
20 IDH6 IDH1 IDH6.IDH1
21 IDH6 IDH5 IDH6.IDH5
23 ODC1-1 IDH1 ODC1-1.IDH1
24 ODC1-1 IDH5 ODC1-1.IDH5
27 ODC2-1 IDH1 ODC2-1.IDH1
29 ODC2-1 ODC1-2 ODC2-1.ODC1-2
35 ODC2-2 ODC1-2 ODC2-2.ODC1-2
答案 1 :(得分:2)
先对每一行的两个值排序,再用 paste 把它们拼接成一个键,然后按该键合并两个表:
# Build an order-independent key for each pair: the smaller string always
# comes first, so ("A", "B") and ("B", "A") both map to "A_B".
# Inputs are converted with as.character() because data2 stores its columns
# as factors. Only the two pair columns are used, so any extra columns
# already present in the data frames cannot leak into the key.
pair_key <- function(a, b) {
  a <- as.character(a)
  b <- as.character(b)
  paste(pmin(a, b), pmax(a, b), sep = "_")
}

data1$key <- pair_key(data1$Name.x, data1$Name.y)
data2$key <- pair_key(data2$Protein1, data2$Protein2)

# Inner join on the key: one output row per matching (data1 row, data2 row).
res <- merge(data1, data2, by = "key")
head(res)
# key Name.x Name.y Protein1 Protein2
# 1 ACO2_IDH1 ACO2 IDH1 IDH1 ACO2
# 2 ACO2_IDH6 IDH6 ACO2 IDH6 ACO2
# 3 CSY4_ICDH CSY4 ICDH ICDH CSY4
# 4 CSY4_IDH1 CSY4 IDH1 IDH1 CSY4
# 5 CSY4_IDH2 IDH2 CSY4 IDH2 CSY4
# 6 CSY4_IDH5 CSY4 IDH5 IDH5 CSY4