我有a long file我使用readLines
/ strsplit
阅读了一个列表:
> head(edges.split)
[[1]]
[1] "1" "1263895" "4415645" "1798592" "576013" "1315720" "1179526"
[8] "4257735" "4368477" "4045891" "336813" "4257736" "1179526" "3494186"
[15] "4257735" "4257735"
[[2]]
[1] "2" "4831424" "2070750" "3" "798464" "1208032" "351213"
[8] "2816552" "1484206" "4493159" "5" "1" "4" "4493043"
[15] "3126743" "1207504" "1499874" "214487" "173486" "1484207"
[[3]]
[1] "3" "2" "4" "3648046" "1872711" "1275714" "702512"
[8] "1275655" "1667650" "1484207"
[[4]]
[1] "4" "4463893" "3618982" "3624614" "3299496" "4348657" "4104419"
[8] "3070955" "2707725" "5" "4463739" "4158900" "1135360" "653364"
[15] "806185" "2465873" "3299496" "3060623" "1965801" "1005013" "3070955"
[22] "3103098" "4283482" "1951317" "1487656" "4632995" "4402849" "2707725"
[29] "1564441" "576420" "1972753" "1740415" "3070390" "2391329" "3827055"
[36] "996590" "4267592" "3787645" "1857269" "4348657" "3491190" "3787645"
[43] "3149658" "3159019" "3787645" "1135358" "2183685" "2303714" "3159019"
[50] "2465873" "4276571" "4446386" "2854060" "3299496" "1740415" "4402849"
[57] "4632995" "3494237" "2050300" "1135358" "3787645"
[[5]]
[1] "5" "336813" "4" "3159019" "2303714" "1740415" "4"
[8] "305277" "2707725" "2303714" "1740415" "3494237" "1135358" "4"
[[6]]
[1] "6" "499620" "3622792" "1315540" "576013" "1798592" "3965874"
[8] "752451" "1017219" "1762253" "3693356" "348788" "4038359" "336813"
[15] "3449680" "4717601" "3545052" "4494041" "748702" "1093005" "3143747"
[22] "1648572" "1093005" "1648572" "3143747"
现在我想将其转换为3列data.frame
/ data.table
:
edges.df <- do.call(rbind,lapply(edges.split,function (l)
if (length(l) <= 1) NULL
else {
tab <- table(tail(l,-1))
data.table(src=as.integer(l[1]),
dst=as.integer(names(tab)),
weight=as.numeric(tab))
}))
str(edges.df)
str(edges.df) # 156716688x2
Classes ?data.table? and 'data.frame': 116330611 obs. of 3 variables:
$ src : int 1 1 1 1 1 1 1 1 1 1 ...
$ dst : int 1179526 1263895 1315720 1798592 336813 3494186 4045891 4257735 4257736 4368477 ...
$ weight: num 2 1 1 1 1 1 1 3 1 1 ...
这需要 5.5小时并消耗20GB RAM(data.frame
版本直到运行 - 15小时并计算)。
更简单的矩阵版本
edges.df <- do.call(rbind,lapply(edges.split,function (l)
cbind(as.integer(l[1]),as.integer(tail(l,-1)))))
在10分钟内完成,产生156716688x2矩阵。
table
来电是否会产生巨大的时差?
我怎样才能加快这个速度?
答案 0 :(得分:2)
我认为在每次迭代中调用data.table
,as.integer
(两次),as.numeric
和table
等操作只是错误的做法。我建议先使用unnest
中的tidyr
来创建数据集,然后让data.table
参与进来。我没有你的真实数据,但我敢打赌这应该更快
library(tidyr)
library(data.table)
edges.df <- setDT(unnest(edges.split, "src"))[,
.(weight = .N),
keyby = .(src, dst = x)]
输出
head(edges.df)
# src dst weight
# 1: X1 1 1
# 2: X1 1179526 2
# 3: X1 1263895 1
# 4: X1 1315720 1
# 5: X1 1798592 1
# 6: X1 336813 1
答案 1 :(得分:2)
如果我理解你的问题,我会尝试将各个部分拼凑起来然后制成表格。利用rep
和.N
等高效功能来提高效果。
如果没有可重复的数据,我建议尝试类似:
## Extract just the first values of each list element
Nam <- vapply(edges.split, function(x) x[1], character(1L))
## How long is each list element (minus the first element)?
Len <- vapply(edges.split, length, numeric(1L)) - 1
## Put the pieces together and use `.N` to aggregate
data.table(src = rep(Nam, Len),
dst = unlist(lapply(edges.split,
function(x) x[-1])))[
, list(weight = .N), by = .(src, dst)]
但是,应该注意的是,您需要使"Nam"
唯一,以便与您的方法的输出相匹配。
以下是一些基准测试。大卫的功能与输出不完全匹配,但我认为可以很容易地修改它(现在没有时间进行实验)。
首先,功能:
opFun <- function() {
do.call(rbind,lapply(edges.split,function (l)
if (length(l) <= 1) NULL
else {
tab <- table(tail(l,-1))
data.table(src=as.integer(l[1]),
dst=as.integer(names(tab)),
weight=as.numeric(tab))
}))
}
myFun <- function() {
Nam <- vapply(edges.split, function(x) x[1], character(1L))
Nam <- make.unique(Nam)
Len <- vapply(edges.split, length, numeric(1L)) - 1
data.table(src = rep(Nam, Len),
dst = unlist(lapply(edges.split,
function(x) x[-1])))[
, list(weight = .N), by = .(src, dst)]
}
da <- function() {
setDT(unnest(edges.split, "src"))[
, .(weight = .N), keyby = .(src, dst = x)]
}
第二,制作一些样本数据的方法:
data.maker <- function(size) {
set.seed(1)
lapply(seq_len(size), function(x) {
as.character(c(x, sample(100, sample(20), TRUE)))
})
}
第三,时机:
library(microbenchmark)
## 100 list items
edges.split <- data.maker(100)
microbenchmark(opFun(), myFun(), da(), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# opFun() 227.980049 231.180087 235.767195 238.358194 239.68957 240.84357 10
# myFun() 6.276912 6.372855 7.015674 6.700846 6.76109 10.79427 10
# da() 9.984779 10.152121 10.419066 10.350701 10.73314 11.01650 10
## 100k list items
edges.split <- data.maker(100000)
system.time(da())
# user system elapsed
# 9.52 0.11 9.64
system.time(myFun())
# user system elapsed
# 3.03 0.08 3.14
## 1M list items
edges.split <- data.maker(1000000)
system.time(da())
# user system elapsed
# 129.53 2.22 132.51
system.time(myFun())
# user system elapsed
# 31.30 0.71 32.14