Question

我有一个长度为 30000 的列表，其中每个元素都是一个包含 x 和 y 两列的数据框。这些数据框是稀疏的，也就是说并非每个 x 值都会出现。所有 x 值都在 1 到 200 之间。

我想将此列表转换为单个数据框：每个可能的 x 值对应一列，每一行对应列表中的一个条目，存放该条目的所有 y 值（如果某个 x 值不存在，则对应的单元格应为 0）。我有一个可行的解决方案（见下文），但它非常非常慢，我认为一定有更快（也可能更优雅）的方法。

我目前的解决方案（速度很慢）是：

# Start from an all-NA numeric matrix: one row per list entry, one column per x.
dat <- matrix(NA_real_, nrow = 30000, ncol = 200)
for (i in seq_along(whaledatas)) {
  entry <- whaledatas[[i]]
  # Copy one (x, y) pair at a time, looking each row up by its row name.
  for (row_id in row.names(entry)) {
    dat[i, entry[row_id, "x"]] <- entry[row_id, "y"]
  }
}

# Append files$label as an extra column, then replace every remaining NA with 0.
dfData <- data.frame(dat, files$label)
dfData[is.na(dfData)] <- 0

Answer 1

这是一个需要花费合理时间的答案：

# Build one random sparse data frame of dummy data.
# `idx` is accepted so the function can be used with lapply() but is not used.
# Returns a data frame with 50 to 100 rows: `x` holds distinct values drawn
# from 1..200 and `y` is a random permutation of 1..nrow.
my_sampler <- function(idx) {
  n_points <- sample(50:100, size = 1)
  x_vals <- sample(200, size = n_points)
  y_vals <- sample(length(x_vals))
  data.frame(x = x_vals, y = y_vals)
}

# Build the benchmark input: a list of 30000 random sparse data frames.
in.d <- lapply(seq_len(30000), my_sampler)

解决方案：使用data.table

# library() stops with an error if data.table is missing, whereas require()
# would only return FALSE and let the code below fail later.
library(data.table)

# For every list entry: treat it as a data.table keyed on x, look up x = 1..200
# and keep the y column; the per-entry vectors are then row-bound into a matrix.
# NOTE(review): setattr() and setkey() work by reference, so this presumably
# also alters the data frames stored in in.d -- confirm if in.d is reused.
system.time(out.d <- do.call(rbind, lapply(in.d, function(x) {
  setattr(x, "class", c("data.table", "data.frame")) # mnel's suggestion
  setkey(x, "x")
  x[J(1:200)]$y # NA where an x value is absent; replaced with 0 afterwards
})))

#   user  system elapsed 
# 47.111   0.343  51.283 

> dim(out.d)
# [1] 30000   200

# final step: swap every NA left in out.d for 0
out.d <- replace(out.d, is.na(out.d), 0)

编辑：正如 @regetz 所示，先分配好最终的矩阵，再把 x 出现的位置替换为对应的 y 值，这个做法很聪明！下面是 @regetz 解决方案的一个小变体：

# Preallocate the result as zeros with one row per list entry, so cells whose
# x value is absent are already 0. Sizing from in.d keeps the matrix and the
# loop in step with the input instead of relying on a hard-coded row count.
m <- matrix(0.0, nrow = length(in.d), ncol = 200)
system.time(for (i in seq_len(nrow(m))) {
  # Row i: write the y values into the columns named by the x values.
  m[i, in.d[[i]][["x"]]] <- in.d[[i]][["y"]]
})

#   user  system elapsed 
#  1.496   0.003   1.511

这似乎比 @regetz 的 datify 函数更快（其计时如下所示）：

> system.time(dat <- datify(in.d, xmax=200))
#   user  system elapsed 
#  2.966   0.015   2.993

Answer 2

我会使用data.table解决方案，如下所示：

library(data.table)

# 30000 identical data frames, each holding x = y = 1..200
whaledatas <- replicate(
  30000,
  data.frame(x = 1:200, y = 1:200),
  simplify = FALSE
)

# Stack every list entry into one long data.table
dtt <- rbindlist(whaledatas)

Answer 3

首先，这是一个数据框列表的小例子：

# Two tiny sparse inputs: x = 1..3 in the first frame, x = 6..10 in the second
whaledatas <- list(
  data.frame(x = seq.int(1L, 3L), y = seq.int(11L, 13L)),
  data.frame(x = seq.int(6L, 10L), y = seq.int(16L, 20L))
)

我认为下面的代码与原始问题中的 for 循环做的是同一件事：

# combine into single data frame
whaledatas.all <- do.call("rbind", whaledatas)

# change this to 200! kept small here for illustration...
XMAX <- 10

# create output matrix, pre-filled with 0 so absent x values stay 0
dat <- matrix(0.0, nrow = length(whaledatas), ncol = XMAX)

# create index vector for dat rows: list position k is repeated once per row
# of its data frame (vapply() guarantees integer counts, unlike sapply())
i <- rep(seq_along(whaledatas), times = vapply(whaledatas, nrow, integer(1)))

# populate dat using a two-column (row, column) index matrix
dat[cbind(i, whaledatas.all[["x"]])] <- whaledatas.all[["y"]]

修改

随着输入数量的增加，rbind变得非常缓慢。这个版本（为了方便而包含在一个函数中）避免了它，并且运行得更快：

# Convert a list of sparse (x, y) data frames into a dense matrix.
#
# Arguments:
#   x    - list of data frames, each with a column `x` (column positions) and
#          a column `y` (values to store at those positions).
#   xmax - number of columns of the result (default 200).
#
# Returns a matrix with length(x) rows and xmax columns: row i holds the y
# values of x[[i]] at their x positions and 0 everywhere else.
datify <- function(x, xmax = 200) {
  # Start from all zeros so absent x values need no clean-up afterwards.
  dat <- matrix(0.0, nrow = length(x), ncol = xmax)
  for (i in seq_along(x)) {
    this.df <- x[[i]]
    # Two-column (row, column) index matrix addressing cells in row i of dat.
    coords <- cbind(rep(i, nrow(this.df)), this.df[["x"]])
    dat[coords] <- this.df[["y"]]
  }
  dat
}

请注意，dat 一开始就全部初始化为 0，因此事后不需要再把 NA 替换为 0……

> datify(whaledatas, xmax=10)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]   11   12   13    0    0    0    0    0    0     0
[2,]    0    0    0    0    0   16   17   18   19    20

对使用 Arun 的 my_sampler 函数生成的、长度为 30000 的示例数据框列表进行计时的结果：

# Time datify() on 30000 random sparse data frames (seeded for repeatability).
set.seed(99)
in.d <- lapply(1:30000, function(x) my_sampler(x))
system.time(dat <- datify(in.d, xmax = 200))
##    user  system elapsed
##   1.317   0.011   1.328

将x / y坐标列表有效地转换为R中的数据帧

3 个答案: