匹配来自不同来源的数据框中的多个列

时间:2013-07-06 01:08:39

标签: r dataframe

我有不同时间点的多个玩家之间玩游戏的结果。我从两个不同的来源获得这些信息,他们为每个玩家分配不同的唯一ID。我想找到一种雄辩的方式来匹配玩家ID的两个数据源。两个数据源

sourcex <- structure(list(outcomedate = structure(c(12637, 12637, 12637, 
12637, 12637, 12637, 12637, 12637, 12637, 12637, 12637, 12638, 
12639, 12640, 12640, 12640, 12640, 12640, 12640, 12640, 12640, 
12641, 12641, 12641, 12643, 12644, 12644, 12644, 12644, 12644, 
12644, 12644, 12644, 12644, 12644, 12645), class = "Date"), xid1 = c(206L, 
208L, 209L, 216L, 233L, 235L, 239L, 241L, 250L, 253L, 259L, 238L, 
236L, 211L, 221L, 234L, 249L, 254L, 255L, 257L, 258L, 207L, 230L, 
248L, 258L, 207L, 211L, 230L, 234L, 236L, 248L, 249L, 254L, 255L, 
257L, 221L), xid2 = c(211L, 207L, 221L, 249L, 248L, 257L, 234L, 
255L, 236L, 258L, 254L, 230L, 241L, 253L, 235L, 238L, 208L, 233L, 
239L, 259L, 206L, 209L, 250L, 216L, 259L, 216L, 241L, 208L, 235L, 
239L, 253L, 250L, 209L, 238L, 206L, 233L), outcome1 = c(2L, 1L, 
0L, 2L, 1L, 3L, 1L, 1L, 2L, 2L, 0L, 2L, 3L, 3L, 1L, 0L, 2L, 0L, 
0L, 0L, 2L, 1L, 2L, 1L, 0L, 3L, 2L, 0L, 0L, 0L, 2L, 2L, 2L, 1L, 
1L, 1L), outcome2 = c(0L, 0L, 0L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
0L, 1L, 0L, 1L, 0L, 0L, 1L, 2L, 0L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L, 0L, 1L, 1L, 2L, 1L, 0L, 1L, 1L, 3L)), .Names = c("outcomedate", 
"xid1", "xid2", "outcome1", "outcome2"), row.names = c(NA, 36L
), class = "data.frame")

sourcey <- structure(list(outcomedate = structure(c(12637, 12637, 12637, 
12637, 12637, 12637, 12637, 12637, 12637, 12637, 12637, 12638, 
12639, 12640, 12640, 12640, 12640, 12640, 12640, 12640, 12640, 
12641, 12641, 12641, 12643, 12644, 12644, 12644, 12644, 12644, 
12644, 12644, 12644, 12644, 12644, 12645), class = "Date"), yid1 = c(56, 
46, 67, 68, 59, 63, 55, 50, 66, 61, 57, 58, 53, 60, 64, 48, 69, 
54, 51, 65, 62, 47, 49, 52, 64, 60, 47, 48, 69, 49, 54, 51, 65, 
53, 52, 62), yid2 = c(47, 51, 64, 48, 62, 69, 53, 54, 60, 49, 
65, 52, 50, 63, 57, 56, 61, 46, 58, 67, 66, 59, 68, 55, 63, 57, 
68, 55, 59, 67, 58, 66, 50, 46, 56, 61), outcome1 = structure(c(1L, 
1L, 2L, 2L, 3L, 3L, 2L, 1L, 4L, 1L, 2L, 2L, 4L, 3L, 2L, 2L, 3L, 
3L, 3L, 4L, 1L, 1L, 1L, 2L, 3L, 1L, 4L, 2L, 2L, 2L, 1L, 3L, 2L, 
3L, 3L, 1L), .Label = c("1", "2", "0", "3", "4", "5", "6"), class = "factor"), 
    outcome2 = structure(c(1L, 2L, 3L, 2L, 1L, 1L, 2L, 2L, 3L, 
    2L, 1L, 2L, 1L, 3L, 2L, 2L, 3L, 1L, 1L, 2L, 1L, 3L, 2L, 3L, 
    2L, 2L, 3L, 2L, 1L, 3L, 2L, 2L, 3L, 2L, 1L, 4L), .Label = c("0", 
    "1", "2", "3", "4"), class = "factor")), .Names = c("outcomedate", 
"yid1", "yid2", "outcome1", "outcome2"), row.names = c(NA, 36L
), class = "data.frame")

这两个来源共有outcomedateoutcome1outcome2。他们为游戏中的各个玩家分配不同的ID。我已经完成以下操作来查找ID之间的匹配。

sourcex$ID <- with(sourcex, paste0(outcomedate, outcome1, outcome2))
sourcey$ID <- with(sourcey, paste0(outcomedate, outcome1, outcome2))

uPlayersx <- with(sourcex, unique(c(xid1, xid2)))
uPlayersy <- with(sourcey, unique(c(yid1, yid2)))

comparex <- sapply(uPlayersx, function(x){
  paste0(with(sourcex, ID[xid1 == x| xid2 == x]), collapse = '~')
})

comparey <- sapply(uPlayersy, function(x){
  paste0(with(sourcey, ID[yid1 == x| yid2 == x]), collapse = '~')
})

dumMatch <- data.frame(xid = uPlayersx, yid = uPlayersy[match(comparex, comparey)])

这个测试数据集在这里工作正常但实际应用程序更大,这感觉就像一个黑客。此外,真实数据集可能在报告等方面存在错误,因此可能需要部分匹配。任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:1)

这将(至少)帮助过滤掉完全匹配的日子:

match.day <- function(d)
{
    tempx <- sourcex[sourcex$outcomedate==d,]
    tempy <- sourcey[sourcey$outcomedate==d,]

    if(nrow(tempx)!=nrow(tempy)) stop("matching failed: number of rows differ.")

    P <- outer(tempx$outcome1, tempy$outcome1, `==`) &
         outer(tempx$outcome2, tempy$outcome2, `==`)

    if(any(rowSums(P)!=1)) stop("maching failed: ambiguous or impossible assignment.")

    map <- P %*% seq_len(nrow(tempy))

    cbind(tempx[,c("xid1","xid2")], tempy[map,c("yid1","yid2")])
}

days <- unique(c(sourcex$outcomedate, sourcey$outcomedate))

do.call(rbind, lapply(days[-c(1,4,7)], match.day))

请注意,第1,4和7天失败了(请参阅days[c(1,4,7)])。

其他日子的结果:

   xid1 xid2 yid1 yid2
12  238  230   58   52
13  236  241   53   50
22  207  209   47   59
23  230  250   52   55
24  248  216   49   68
25  258  259   64   63
36  221  233   62   61