使用 merge.data.table() 时,我收到了一条关于重复列名的难看的错误消息——奇怪的是,只有当 data.table 的数量超过三个时才会出现。相比之下,针对标准 data.frame 的 merge() 版本却可以正常工作。
我使用this code来实现完全连接,避免使用带有merge.data.table()
的时间分隔列。
因此,在 base R 中
Reduce(function(...) merge(..., all=TRUE), L)
运行正常,而在 data.table
中Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt, names))), L.dt)
则会报错:
Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,
p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove
or rename the duplicate(s) and try again.
In addition: Warning message:
In merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
Error in merge.data.table(..., all = TRUE, by = Reduce(intersect, lapply(L.dt, :
x has some duplicated column name(s): p.l01.x,p.l02.x,p.l03.x,p.l05.x,p.l06.x,p.l07.x,
p.l08.x,p.l10.x,p.l11.x,p.l12.x,
p.l01.y,p.l02.y,p.l03.y,p.l05.y,p.l06.y,p.l07.y,p.l08.y,p.l10.y,p.l11.y,p.l12.y. Please remove
or rename the duplicate(s) and try again.
注意,这个问题似乎在某种程度上是累积的……
虽然
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:2], names))), L.dt[1:2])
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:3], names))), L.dt[1:3])
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:4], names))), L.dt[1:4])
仍然可以正常运行
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:5], names))), L.dt[1:5])
会发出警告,而从这里开始,
Reduce(function(...)
merge(..., all=TRUE, by=Reduce(intersect, lapply(L.dt[1:6], names))), L.dt[1:6])
最终出现了错误。
因此,merge.data.table() 似乎在 data.table 的数量超过 3 个之前都能正常工作?有什么办法可以解决这个问题吗?
注意:使用 easy example data 不会出现该错误,因此问题可能是由我的数据引起的。但我不明白为什么——标准的 merge() 明明可以正常工作!我的真实数据维度约为 15,000 × 500,其中一些变量名在列表中的各个数据集之间相同,另一些变量名则只出现在个别数据集中。下面这个 MCVE 似乎能很好地重现该问题。
# Column-name templates mimicking the real data: a list of 18 character
# vectors. Some names (e.g. "p.d17m", "p.d17") recur in every vector, while
# others appear only in a few of them, so merging tables labelled with these
# names produces overlapping non-key columns.
# Only nm[[1]] .. nm[[6]] are referenced by the naming code further down.
nm <- list(c("p.d17m", "p.d17", "p.d29", "p.d31", "p.n03", "p.n04",
"p.n05", "p.d36", "p.d40", "p.d41", "p.d45", "p.d46", "p.d50",
"p.d51", "p.d55", "p.d56", "p.d60", "p.d61", "p.d65", "p.d66",
"p.d70", "p.d71", "p.n08"), c("p.d17m", "p.d17", "p.c44", "p.l01",
"p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08", "p.l10",
"p.l11", "p.l12", "p.l13", "p.l15", "p.l16", "p.l17", "p.l18",
"p.l20", "p.l21", "p.l22", "p.l23", "p.l25"), c("p.d17m", "p.d17",
"p.c44", "p.l01", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07",
"p.l08", "p.l10", "p.l11", "p.l12", "p.l13", "p.l15", "p.l16",
"p.l17", "p.l18", "p.l20", "p.l21", "p.l22", "p.l23", "p.l25"
), c("p.d17m", "p.d17", "p.c44", "p.l01", "p.l01r", "p.l02",
"p.l03", "p.l05", "p.l06", "p.l07", "p.l08", "p.l10", "p.l11",
"p.l12", "p.l13", "p.l15", "p.l16", "p.l17", "p.l18", "p.l20",
"p.l21", "p.l22", "p.l23"), c("p.d17m", "p.d17", "p.c44", "p.l01",
"p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08",
"p.l10", "p.l11", "p.l12", "p.l13", "p.l15", "p.l16", "p.l17",
"p.l18", "p.l20", "p.l21", "p.l22", "p.l23"), c("pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.l01", "p.l01r", "p.l02", "p.l03",
"p.l05", "p.l06", "p.l07", "p.l08", "p.l10", "p.l11", "p.l12"
), c("pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.l01",
"p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07", "p.l08",
"p.l10", "p.l11", "p.l12"), c("pdate.", "p.d200", "p.d201", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.l01", "p.l01r", "p.l02", "p.l03", "p.l05"), c("pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.l01", "p.l01r", "p.l02", "p.l03", "p.l05", "p.l06", "p.l07"
), c("pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47",
"p.c48", "p.c49", "p.c50", "p.l01", "p.l01r", "p.l02", "p.l03",
"p.l05", "p.l06", "p.l07"), c("plingu.", "pdate.", "p.d17m",
"p.d17", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165",
"p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50",
"p.c60", "p.c61", "p.c62", "p.c63", "p.c64", "p.c65"), c("hab_ch.",
"plingu.", "pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162",
"p.d163", "p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c47",
"p.c48", "p.c49", "p.c50", "p.c60", "p.c61", "p.c62", "p.c63",
"p.c64"), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17",
"p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
"p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.c60",
"p.c61", "p.c62", "p.c63", "p.c64"), c("hab_ch.", "plingu.",
"pdate.", "p.d17m", "p.d17", "p.d160", "p.d161", "p.d162", "p.d163",
"p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c100", "p.c101",
"p.c102", "p.c103", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01"
), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a",
"p.d110b", "p.d160", "p.d161", "p.d162", "p.d163", "p.d164",
"p.d165", "p.d166", "p.d167", "p.c44", "p.c47", "p.c48", "p.c49",
"p.c50", "p.l01", "p.l01r", "p.l02"), c("hab_ch.", "plingu.",
"pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b", "p.d160",
"p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166", "p.d167",
"p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01", "p.l01r",
"p.l02"), c("hab_ch.", "plingu.", "pdate.", "p.d17m", "p.d17",
"p.d110a", "p.d110b", "p.d160", "p.d161", "p.d162", "p.d163",
"p.d164", "p.d165", "p.d166", "p.d167", "p.c44", "p.c100", "p.c101",
"p.c102", "p.c103", "p.c47", "p.c48", "p.c49"), c("hab_ch.",
"plingu.", "pdate.", "p.d17m", "p.d17", "p.d110a", "p.d110b",
"p.d160", "p.d161", "p.d162", "p.d163", "p.d164", "p.d165", "p.d166",
"p.d167", "p.c44", "p.c47", "p.c48", "p.c49", "p.c50", "p.l01",
"p.l01r", "p.l02"))
n <- 10 # number of rows in every simulated data frame

# Build one simulated data frame.
#
# x: a single number, used both as the RNG seed and as the constant value
#    of the `yr` column.
# Returns an n x 22 data.frame: `id` (1..n), `yr` (x) and 20 columns of
# standard-normal draws; the unnamed matrix columns receive data.frame()'s
# automatic names.
# Reads the global `n` and resets the global RNG state via set.seed(x).
fun <- function(x) {
  set.seed(x)
  # Return the data frame directly. The original ended on an assignment,
  # which returned the value invisibly and left an unused local behind.
  data.frame(
    cbind(id = 1:n, yr = x,
          matrix(rnorm(n * 20), n)))
}
# Simulate six data frames (seeds 1 to 6), label them df1 .. df6, and copy
# each list element into the global environment under its name.
tmp <- lapply(1:6, fun)
names(tmp) <- paste0("df", 1:6)
list2env(tmp, envir = globalenv())
# Insert an extra column `foo` (random draws from 1000..2000) right after
# `id` in df3 .. df6, so these frames carry one more column than df1/df2.
# The draws continue the current RNG stream, so the statement order matters
# for reproducibility and is kept as is.
df3 <- cbind(id = df3[, 1],
             foo = sample(1e3:2e3, nrow(df3), replace = TRUE),
             df3[, -1])
df4 <- cbind(id = df4[, 1],
             foo = sample(1e3:2e3, nrow(df4), replace = TRUE),
             df4[, -1])
df5 <- cbind(id = df5[, 1],
             foo = sample(1e3:2e3, nrow(df5), replace = TRUE),
             df5[, -1])
# Fixed: the sample size is taken from df6 itself (was nrow(df5)).
df6 <- cbind(id = df6[, 1],
             foo = sample(1e3:2e3, nrow(df6), replace = TRUE),
             df6[, -1])
# Assign the template names to the simulated value columns.
# df1/df2: value columns sit in positions 3:22 (after id, yr).
# df3 .. df6: the extra `foo` column shifts them to positions 4:23.
names(df1)[3:22] <- nm[[1]][3:22]
names(df2)[3:22] <- nm[[2]][3:22]
names(df3)[4:23] <- nm[[3]][4:23]
names(df4)[4:23] <- nm[[4]][4:23]
names(df5)[4:23] <- nm[[5]][4:23]
# Fixed: name df6 from nm[[6]]. The original assigned to names(df5) a
# second time, overwriting df5's names and leaving df6 with its default
# column names.
names(df6)[4:23] <- nm[[6]][4:23]
# The six data frames collected in a list (input for the base-R merge).
L <- list(df1, df2, df3, df4, df5, df6)
# The same tables converted to data.tables, each keyed on id and yr.
library(data.table)
L.dt <- lapply(L, function(frame) {
  keyed <- data.table(frame)
  setkeyv(keyed, c("id", "yr")) # sets the key by reference
  keyed
})