do.call/rbind在data.frame / data.table上比在矩阵上慢吗?

时间:2015-04-08 11:17:11

标签: r performance data.table

我有a long file我使用readLines / strsplit阅读了一个列表:

> head(edges.split)
[[1]]
 [1] "1"       "1263895" "4415645" "1798592" "576013"  "1315720" "1179526"
 [8] "4257735" "4368477" "4045891" "336813"  "4257736" "1179526" "3494186"
[15] "4257735" "4257735"

[[2]]
 [1] "2"       "4831424" "2070750" "3"       "798464"  "1208032" "351213" 
 [8] "2816552" "1484206" "4493159" "5"       "1"       "4"       "4493043"
[15] "3126743" "1207504" "1499874" "214487"  "173486"  "1484207"

[[3]]
 [1] "3"       "2"       "4"       "3648046" "1872711" "1275714" "702512" 
 [8] "1275655" "1667650" "1484207"

[[4]]
 [1] "4"       "4463893" "3618982" "3624614" "3299496" "4348657" "4104419"
 [8] "3070955" "2707725" "5"       "4463739" "4158900" "1135360" "653364" 
[15] "806185"  "2465873" "3299496" "3060623" "1965801" "1005013" "3070955"
[22] "3103098" "4283482" "1951317" "1487656" "4632995" "4402849" "2707725"
[29] "1564441" "576420"  "1972753" "1740415" "3070390" "2391329" "3827055"
[36] "996590"  "4267592" "3787645" "1857269" "4348657" "3491190" "3787645"
[43] "3149658" "3159019" "3787645" "1135358" "2183685" "2303714" "3159019"
[50] "2465873" "4276571" "4446386" "2854060" "3299496" "1740415" "4402849"
[57] "4632995" "3494237" "2050300" "1135358" "3787645"

[[5]]
 [1] "5"       "336813"  "4"       "3159019" "2303714" "1740415" "4"      
 [8] "305277"  "2707725" "2303714" "1740415" "3494237" "1135358" "4"      

[[6]]
 [1] "6"       "499620"  "3622792" "1315540" "576013"  "1798592" "3965874"
 [8] "752451"  "1017219" "1762253" "3693356" "348788"  "4038359" "336813" 
[15] "3449680" "4717601" "3545052" "4494041" "748702"  "1093005" "3143747"
[22] "1648572" "1093005" "1648572" "3143747"

现在我想将其转换为3列data.frame / data.table

edges.df <- do.call(rbind,lapply(edges.split,function (l)
  if (length(l) <= 1) NULL
  else {
    tab <- table(tail(l,-1))
    data.table(src=as.integer(l[1]),
               dst=as.integer(names(tab)),
               weight=as.numeric(tab))
  }))
str(edges.df)
str(edges.df) # 156716688x2
Classes ?data.table? and 'data.frame':  116330611 obs. of  3 variables:
 $ src   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ dst   : int  1179526 1263895 1315720 1798592 336813 3494186 4045891 4257735 4257736 4368477 ...
 $ weight: num  2 1 1 1 1 1 1 3 1 1 ...

这需要 5.5小时并消耗20GB RAM(data.frame版本直到运行 - 15小时并计算)。

更简单的矩阵版本

edges.df <- do.call(rbind,lapply(edges.split,function (l)
  cbind(as.integer(l[1]),as.integer(tail(l,-1)))))

在10分钟内完成,产生156716688x2矩阵。

table来电是否会产生巨大的时差? 我怎样才能加快这个速度?

2 个答案:

答案 0 :(得分:2)

我认为在每次迭代中调用data.tableas.integer(两次),as.numerictable等操作只是错误的做法。我建议先使用unnest中的tidyr来创建数据集,然后让data.table参与进来。我没有你的真实数据,但我敢打赌这应该更快

library(tidyr)
library(data.table)
edges.df <- setDT(unnest(edges.split, "src"))[, 
                  .(weight = .N), 
                  keyby = .(src, dst = x)]

输出

head(edges.df)
#    src     dst weight
# 1:  X1       1      1
# 2:  X1 1179526      2
# 3:  X1 1263895      1
# 4:  X1 1315720      1
# 5:  X1 1798592      1
# 6:  X1  336813      1

答案 1 :(得分:2)

如果我理解你的问题,我会尝试将各个部分拼凑起来然后制成表格。利用rep.N等高效功能来提高效果。

如果没有可重复的数据,我建议尝试类似:

## Extract just the first values of each list element
Nam <- vapply(edges.split, function(x) x[1], character(1L))

## How long is each list element (minus the first element)?
Len <- vapply(edges.split, length, numeric(1L)) - 1

## Put the pieces together and use `.N` to aggregate
data.table(src = rep(Nam, Len), 
           dst = unlist(lapply(edges.split, 
                               function(x) x[-1])))[
                                 , list(weight = .N), by = .(src, dst)]

但是,应该注意的是,您需要使"Nam"唯一,以便与您的方法的输出相匹配。


以下是一些基准测试。大卫的功能与输出不完全匹配,但我认为可以很容易地修改它(现在没有时间进行实验)。

首先,功能:

opFun <- function() {
  do.call(rbind,lapply(edges.split,function (l)
    if (length(l) <= 1) NULL
    else {
      tab <- table(tail(l,-1))
      data.table(src=as.integer(l[1]),
                 dst=as.integer(names(tab)),
                 weight=as.numeric(tab))
    }))
} 


myFun <- function() {
    Nam <- vapply(edges.split, function(x) x[1], character(1L))
    Nam <- make.unique(Nam)
    Len <- vapply(edges.split, length, numeric(1L)) - 1

    data.table(src = rep(Nam, Len), 
               dst = unlist(lapply(edges.split, 
                                   function(x) x[-1])))[
                                     , list(weight = .N), by = .(src, dst)]
}

da <- function() {
  setDT(unnest(edges.split, "src"))[
    , .(weight = .N), keyby = .(src, dst = x)]
}

第二,制作一些样本数据的方法:

data.maker <- function(size) {
  set.seed(1)
  lapply(seq_len(size), function(x) {
    as.character(c(x, sample(100, sample(20), TRUE)))
  })
}

第三,时机:

library(microbenchmark)

## 100 list items
edges.split <- data.maker(100)
microbenchmark(opFun(), myFun(), da(), times = 10)
# Unit: milliseconds
#    expr        min         lq       mean     median        uq       max neval
# opFun() 227.980049 231.180087 235.767195 238.358194 239.68957 240.84357    10
# myFun()   6.276912   6.372855   7.015674   6.700846   6.76109  10.79427    10
#    da()   9.984779  10.152121  10.419066  10.350701  10.73314  11.01650    10

## 100k list items
edges.split <- data.maker(100000)
system.time(da())
# user  system elapsed 
# 9.52    0.11    9.64 
system.time(myFun())
# user  system elapsed 
# 3.03    0.08    3.14 

## 1M list items
edges.split <- data.maker(1000000)
system.time(da())
#    user  system elapsed 
#  129.53    2.22  132.51 
system.time(myFun())
#    user  system elapsed 
#   31.30    0.71   32.14