如何正确合并(完全连接)包含三个以上 data.table 的列表?

时间:2018-11-23 14:10:50

标签: r merge data.table

使用 merge.data.table() 时,我收到一条关于列名重复的难看的错误消息——奇怪的是,只有当 data.table 的数量超过三个时才会出现。相比之下,用于标准 data.frame 的 merge() 版本可以正常工作。

我使用this code来实现完全连接,以避免在使用merge.data.table()时产生重复的列。

因此,在 base R 中

# Full outer join (all=TRUE) of every element of L, folded pairwise by Reduce().
# No `by` is given, so merge() falls back to its default join columns.
Reduce(function(...) merge(..., all=TRUE), L)

运行正常,而在 data.table 中

# Same pairwise full outer join, applied to the list L.dt. `by` is the set of
# column names common to every element of L.dt, so columns shared by only some
# of the tables are not used as join keys.
# NOTE(review): such non-key shared columns presumably receive merge()'s
# ".x"/".y" suffixes, and repeated merges could then generate the same suffixed
# names again, which would explain the duplicated-name error quoted in the
# question - confirm against the merge.data.table documentation.
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt, names))), L.dt)

则会报错:

  Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt,  : 
  x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,
p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove 
or rename the duplicate(s) and try again.
In addition: Warning message:
In merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt,  :

 Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt,  : 
  x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove
or rename the duplicate(s) and try again. 

注意,该问题似乎是某种程度的累积...

# The same full join restricted to the first two tables of L.dt ...
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:2], names))), L.dt[1:2])

# ... to the first three tables ...
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:3], names))), L.dt[1:3])

# ... and to the first four tables.
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:4], names))), L.dt[1:4])

仍然可以正常运行

# The same full join on the first five tables of L.dt.
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:5], names))), L.dt[1:5])

则发出警告,而从下面这段代码起

# The same full join on the first six tables of L.dt.
Reduce(function(...)
  merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:6], names))), L.dt[1:6])

最终开始出现错误。

因此, merge.data.table()似乎工作正常,直到data.tables的数量超过3个?有什么想法可以解决这个问题吗?

注意:使用easy example data(简单的示例数据)不会出现错误,因此问题可能是由我的数据引起的。但我不明白原因——标准的merge()可以正常工作!我的真实数据的维度约为 15,000 × 500,有些变量名在列表的各个表之间相同,另一些则只出现在个别表中。这个 MCVE 似乎很好地重现了该问题。

用于复现问题的数据和代码

# Column-name templates mimicking the real data: an unnamed list of 18
# character vectors holding 23 names each. Several entries are exact repeats
# of an earlier one, so every distinct vector is spelled out once inside
# local() and reused when the list is assembled.
nm <- local({
  v01 <- c(
    "p.d17m", "p.d17", "p.d29", "p.d31", "p.n03", "p.n04", "p.n05", "p.d36",
    "p.d40", "p.d41", "p.d45", "p.d46", "p.d50", "p.d51", "p.d55", "p.d56",
    "p.d60", "p.d61", "p.d65", "p.d66", "p.d70", "p.d71", "p.n08"
  )
  # used for entries 2 and 3
  v02 <- c(
    "p.d17m", "p.d17", "p.c44", "p.l01", "p.l02", "p.l03", "p.l05", "p.l06",
    "p.l07", "p.l08", "p.l10", "p.l11", "p.l12", "p.l13", "p.l15", "p.l16",
    "p.l17", "p.l18", "p.l20", "p.l21", "p.l22", "p.l23", "p.l25"
  )
  # used for entries 4 and 5
  v04 <- c(
    "p.d17m", "p.d17", "p.c44", "p.l01", "p.l01r", "p.l02", "p.l03", "p.l05",
    "p.l06", "p.l07", "p.l08", "p.l10", "p.l11", "p.l12", "p.l13", "p.l15",
    "p.l16", "p.l17", "p.l18", "p.l20", "p.l21", "p.l22", "p.l23"
  )
  # used for entries 6 and 7
  v06 <- c(
    "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162", "p.d163",
    "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.l01", "p.l01r",
    "p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08", "p.l10", "p.l11",
    "p.l12"
  )
  v08 <- c(
    "pdate.", "p.d200", "p.d201", "p.d17m", "p.d17", "p.d160", "p.d161",
    "p.d162", "p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44",
    "p.c47", "p.c48", "p.c49", "p.c50", "p.l01", "p.l01r", "p.l02", "p.l03",
    "p.l05"
  )
  # used for entries 9 and 10
  v09 <- c(
    "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162", "p.d163",
    "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47", "p.c48",
    "p.c49", "p.c50", "p.l01", "p.l01r", "p.l02", "p.l03", "p.l05", "p.l06",
    "p.l07"
  )
  v11 <- c(
    "plingu.", "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
    "p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47",
    "p.c48", "p.c49", "p.c50", "p.c60", "p.c61", "p.c62", "p.c63", "p.c64",
    "p.c65"
  )
  # used for entries 12 and 13
  v12 <- c(
    "hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161",
    "p.d162", "p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44",
    "p.c47", "p.c48", "p.c49", "p.c50", "p.c60", "p.c61", "p.c62", "p.c63",
    "p.c64"
  )
  v14 <- c(
    "hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161",
    "p.d162", "p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44",
    "p.c100", "p.c101", "p.c102", "p.c103", "p.c47", "p.c48", "p.c49",
    "p.c50", "p.l01"
  )
  # used for entries 15, 16 and 18
  v15 <- c(
    "hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b",
    "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
    "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01", "p.l01r",
    "p.l02"
  )
  v17 <- c(
    "hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b",
    "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
    "p.d167", "p.c44", "p.c100", "p.c101", "p.c102", "p.c103", "p.c47",
    "p.c48", "p.c49"
  )

  list(
    v01, v02, v02, v04, v04, v06, v06, v08, v09,
    v09, v11, v12, v12, v14, v15, v15, v17, v15
  )
})

n <- 10  # number of rows in every generated data frame

# Build one reproducible n x 22 data frame of dummy data.
#   x: number used as the RNG seed and as the constant value of column `yr`.
# Returns a data.frame holding `id` (1..n), `yr` (= x) and 20 further columns
# of rnorm() draws. Reads the global `n`.
fun <- function(x) {
  set.seed(x)
  data.frame(
    cbind(id=seq_len(n), yr=x,
          matrix(rnorm(n*(20)), n)))
}

tmp <- setNames(lapply(1:6, fun), paste0("df", 1:6))  # six data frames, df1..df6

list2env(tmp, globalenv())  # expose df1..df6 as global variables

# Insert an extra column `foo` (random draws from 1000..2000) right after `id`
# in df3..df6; df1 and df2 stay without it. Each sample is sized from the data
# frame it is attached to.
df3 <- cbind(id=df3[, 1], foo=sample(1e3:2e3, nrow(df3), replace=TRUE), df3[, -1])
df4 <- cbind(id=df4[, 1], foo=sample(1e3:2e3, nrow(df4), replace=TRUE), df4[, -1])
df5 <- cbind(id=df5[, 1], foo=sample(1e3:2e3, nrow(df5), replace=TRUE), df5[, -1])
df6 <- cbind(id=df6[, 1], foo=sample(1e3:2e3, nrow(df6), replace=TRUE), df6[, -1])

# Rename the 20 data columns from the templates in `nm`; from df3 onwards the
# range starts one position later because of the inserted `foo` column.
names(df1)[3:22] <- nm[[1]][3:22]
names(df2)[3:22] <- nm[[2]][3:22]
names(df3)[4:23] <- nm[[3]][4:23]
names(df4)[4:23] <- nm[[4]][4:23]
names(df5)[4:23] <- nm[[5]][4:23]
# NOTE(review): the next line renames df5 a second time (replacing the
# nm[[5]] names just assigned), and df6 keeps its default column names.
# `names(df6)` may have been intended, but the line is left as is because the
# duplicated names listed in the error output quoted in the question appear to
# match this naming - confirm before changing.
names(df5)[4:23] <- nm[[6]][4:23]

# list of data.frames:
L <- list(df1, df2, df3, df4, df5, df6)

# list of data.tables, each keyed on id and yr:
library(data.table)
L.dt <- lapply(L, function(x) setkeyv(data.table(x), c("id", "yr")))

另外与此相关的问题:1、2

0 个答案:

没有答案