我有一组包含不同列的数据框。我想将它们按行组合成一个数据帧。我使用plyr::rbind.fill
来做到这一点。我正在寻找能够更有效地完成此任务的内容,但与给出here
require(plyr)
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
rbind.fill(ll)
答案 0 :(得分:37)
更新:请改为this updated answer。
更新(eddi):现在,这已在version 1.8.11中作为fill
的{{1}}参数实施。例如:
rbind
此解决方案使用DT1 = data.table(a = 1:2, b = 1:2)
DT2 = data.table(a = 3:4, c = 1:2)
rbind(DT1, DT2, fill = TRUE)
# a b c
#1: 1 1 NA
#2: 2 2 NA
#3: 3 NA 1
#4: 4 NA 2
的{{1}}函数来“rbind”data.tables列表,为此,请确保使用版本1.8.9,因为this bug在版本&lt; 1.8.9 强>
data.table
绑定data.frames / data.tables列表时,截至目前,将保留第一列的数据类型。也就是说,如果第一个data.frame中的列是字符,而第二个data.frame中的相同列是“factor”,那么rbindlist
将导致此列成为一个字符。因此,如果您的data.frame由所有字符列组成,那么,使用此方法的解决方案将与plyr方法相同。如果不是,则值仍然相同,但某些列将是字符而不是因子。你必须自己转换为“因子”。 Hopefully this behaviour will change in the future
现在这里使用rbindlist
(与rbindlist
中的data.table
进行基准比较):
rbind.fill
应该注意plyr
的{{1}}边缘超过此特定require(data.table)
rbind.fill.DT <- function(ll) {
# changed sapply to lapply to return a list always
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
require(microbenchmark)
microbenchmark(t1 <- rbind.fill.DT(ll), t2 <- rbind.fill.PLYR(ll), times=10)
# Unit: seconds
# expr min lq median uq max neval
# t1 <- rbind.fill.DT(ll) 10.8943 11.02312 11.26374 11.34757 11.51488 10
# t2 <- rbind.fill.PLYR(ll) 121.9868 134.52107 136.41375 184.18071 347.74724 10
# for comparison change t2 to data.table
setattr(t2, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(t2, 0L)
invisible(alloc.col(t2))
setcolorder(t2, unique(unlist(sapply(ll, names))))
identical(t1, t2) # [1] TRUE
解决方案,直到列表大小约为500.
以下是带有plyr
的data.frames列表长度的运行图。我在这些不同的列表长度中使用了rbind.fill
和10个代表。
Here's the gist for benchmarking,以防有人想要复制结果。
答案 1 :(得分:15)
现在,rbindlist
的{{1}}(和rbind
}已经使用recent changes/commits in v1.9.3(开发版)改进了功能和速度,data.table
速度更快dplyr
plyr
的版本,名为rbind.fill
,我this answer似乎有点过时了。
以下是rbind_all
的相关新闻条目:
rbindlist
所以,我已经在下面相对较大的数据上对较新的(和更快的版本)进行了基准测试。
我们将创建总共10,000个data.tables,列数范围为200-300,绑定后的列总数为500.
o 'rbindlist' gains 'use.names' and 'fill' arguments and is now implemented entirely in C. Closes #5249
-> use.names by default is FALSE for backwards compatibility (doesn't bind by
names by default)
-> rbind(...) now just calls rbindlist() internally, except that 'use.names'
is TRUE by default, for compatibility with base (and backwards compatibility).
-> fill by default is FALSE. If fill is TRUE, use.names has to be TRUE.
-> At least one item of the input list has to have non-null column names.
-> Duplicate columns are bound in the order of occurrence, like base.
-> Attributes that might exist in individual items would be lost in the bound result.
-> Columns are coerced to the highest SEXPTYPE, if they are different, if/when possible.
-> And incredibly fast ;).
-> Documentation updated in much detail. Closes DR #5158.
以下是时间安排:
require(data.table) ## 1.9.3 commit 1267
require(dplyr) ## commit 1504 devel
set.seed(1L)
names = paste0("V", 1:500)
foo <- function() {
cols = sample(200:300, 1)
data = setDT(lapply(1:cols, function(x) sample(10)))
setnames(data, sample(names)[1:cols])
}
n = 10e3L
ll = vector("list", n)
for (i in 1:n) {
.Call("Csetlistelt", ll, i, foo())
}
答案 2 :(得分:5)
如果您将rbind.fill
和rbindlist
并行化,仍然可以获得一些收益。
使用data.table
版本1.8.8完成了结果,因为当我使用并行化函数尝试它时,版本1.8.9变得很糟糕。因此,data.table
和plyr
之间的结果并不相同,但它们在data.table
或plyr
解决方案中是相同的。并行plyr
的含义与不平行的plyr
匹配,反之亦然。
这是基准/脚本。 parallel.rbind.fill.DT
看起来很可怕,但那是我能拉的最快的。
require(plyr)
require(data.table)
require(ggplot2)
require(rbenchmark)
require(parallel)
# data.table::rbindlist solutions
rbind.fill.DT <- function(ll) {
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
parallel.rbind.fill.DT <- function(ll, cluster=NULL){
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
if(is.null(cluster)){
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = cores+1))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:cores]+1), ":", sequ[2:(cores+1)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbindlist(clusterApply(cluster, ll, function(ll, unq.names){
rbindlist(lapply(seq_along(ll), function(x, ll, unq.names) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% colnames(tt)]) := NA_character_]
setcolorder(tt, unq.names)
}, ll=ll, unq.names=unq.names))
}, unq.names=unq.names))
}
}
# plyr::rbind.fill solutions
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
parallel.rbind.fill.PLYR <- function(ll, cluster=NULL, magicConst=400){
if(is.null(cluster) | ceiling(length(ll)/magicConst) < length(cluster)){
rbind.fill(ll)
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = ceiling(length(ll)/magicConst)))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:(length(sequ)-1)]+1), ":", sequ[2:length(sequ)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbind.fill(parLapply(cluster, ll, rbind.fill))
}
}
# Function to generate sample data of varying list length
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(10000, sample.fun())
cl <- makeCluster(4, type="SOCK")
clusterEvalQ(cl, library(data.table))
clusterEvalQ(cl, library(plyr))
benchmark(t1 <- rbind.fill.PLYR(ll),
t2 <- rbind.fill.DT(ll),
t3 <- parallel.rbind.fill.PLYR(ll, cluster=cl, 400),
t4 <- parallel.rbind.fill.DT(ll, cluster=cl),
replications=5)
stopCluster(cl)
# Results for rbinding 10000 dataframes
# done with 4 cores, i5 3570k and 16gb memory
# test reps elapsed relative
# rbind.fill.PLYR 5 321.80 16.682
# rbind.fill.DT 5 26.10 1.353
# parallel.rbind.fill.PLYR 5 28.00 1.452
# parallel.rbind.fill.DT 5 19.29 1.000
# checking are results equal
t1 <- as.matrix(t1)
t2 <- as.matrix(t2)
t3 <- as.matrix(t3)
t4 <- as.matrix(t4)
t1 <- t1[order(t1[, 1], t1[, 2]), ]
t2 <- t2[order(t2[, 1], t2[, 2]), ]
t3 <- t3[order(t3[, 1], t3[, 2]), ]
t4 <- t4[order(t4[, 1], t4[, 2]), ]
identical(t2, t4) # TRUE
identical(t1, t3) # TRUE
identical(t1, t2) # FALSE, mismatch between plyr and data.table
正如您所看到的那样,rbind.fill
的缩放使其与data.table
相当,即使数据帧数量较低,您也可以通过data.table
对等来提高速度。
答案 3 :(得分:0)
只需 dplyr::bind_rows
即可完成工作,如
library(dplyr)
merged_list <- bind_rows(ll)
#check it
> nrow(merged_list)
[1] 100000
> ncol(merged_list)
[1] 26
耗时
> system.time(merged_list <- bind_rows(ll))
user system elapsed
0.29 0.00 0.28