提高 data.table 的计算效率

时间:2018-06-06 05:07:55

标签: r performance data.table grouping

我正在寻找提高以下计算效率的方法

#' Jaccard similarity of two vectors treated as sets
#'
#' Computes |A intersect B| / |A union B|. Both numerator and denominator are
#' taken over distinct values, so repeated elements in either input do not
#' inflate the result.
#'
#' @param a A vector of identifiers.
#' @param b A vector of identifiers comparable with `a`.
#' @return A single number in [0, 1]; `NaN` when both inputs are empty
#'   (0 / 0).
jaccard3 <- function(a, b) {
  length(intersect(a, b)) / length(union(a, b))
}

# Row-by-row lookup: idx identifies one dlong row, so each group fetches the
# iid values of both patents from pp and reports their overlap count and
# Jaccard similarity. print(.GRP) emits the running group counter.
temp <- dlong[, {
  print(.GRP)
  iids_orig <- pp[pnum == origpat, iid]
  iids_ref <- pp[pnum == ref.pat, iid]
  list(
    overlap = sum(iids_orig %in% iids_ref),
    inv.jac = jaccard3(iids_orig, iids_ref)
  )
}, by = idx]

我依赖的两个data.tables如下所示

sapply(dlong, class)
  origpat   ref.pat       jac   originv    refinv       idx 
"numeric" "numeric" "numeric" "integer" "integer" "integer"

sapply(pp, class)
       pnum   inventors      pryear     pr_date    inv_patc         iid 
  "integer" "character"   "integer"      "Date"   "numeric"   "integer"

因此,我的目标是查找origpat和ref.pat各自的发明者(它们在pp中既以字符形式存储在inventors列中,也以整数形式存储在iid列中;我假设整数匹配应该比字符匹配更快)。

dlong的结构使得origpat和ref.pat的值都会出现多次(origpat可能连续出现500次),但origpat与ref.pat的组合是唯一的。dlong有大约1,100万行,每一行由唯一的origpat、ref.pat组合定义,而pp有大约230,000行,每行由唯一的pnum、iid组合定义。

我添加了一个非常简短的数据摘录[评论后更新]

 dput(dlong[1:100])
structure(list(origpat = c(4246034, 4246034, 4246034, 4246034, 
4246034, 4246034, 4246034, 4246034, 4247592, 4247592, 4247592, 
4247592, 4248614, 4248614, 4248614, 4248614, 4248614, 4248614, 
4248614, 4248614, 4248761, 4251278, 4251278, 4251278, 4251278, 
4251278, 4251278, 4251278, 4251278, 4251278, 4253719, 4253719, 
4267550, 4273567, 4273567, 4273567, 4273567, 4273567, 4273567, 
4273568, 4273568, 4273568, 4273568, 4273568, 4273568, 4273569, 
4273569, 4273569, 4273569, 4273569, 4273569, 4273586, 4273586, 
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 
4273813, 4273813, 4277285, 4277286, 4277286, 4277286, 4277286, 
4277286, 4277286, 4277286, 4277286, 4277286, 4277286, 4277286, 
4277286, 4277286, 4277286, 4280829, 4280829), ref.pat = c(4293439, 
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730, 
4431982, 4303735, 4309520, 4315970, 4358181, 4478622, 4312654, 
4293439, 4406517, 4478623, 4277285, 4328022, 4377675, 4293439, 
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730, 
4246034, 4491379, 4534098, 4338590, 4358181, 4478622, 4312654, 
4406517, 4478623, 4248614, 4358181, 4478622, 4312654, 4406517, 
4478623, 4248614, 4358181, 4478622, 4312654, 4406517, 4478623, 
4248614, 4293439, 4277285, 4328022, 4248614, 4297139, 4395271, 
4294190, 4285730, 4246034, 4251278, 4339174, 4282035, 4285731, 
4311504, 4277286, 4298389, 4362819, 4360567, 4295872, 4302235, 
4331770, 4282022, 4304584, 4344670, 4297193, 4292080, 4284686, 
4385802, 4311529, 4314031, 4310651, 4534948, 4293439, 4293439, 
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730, 
4246034, 4251278, 4339174, 4282035, 4285731, 4311504, 4358181, 
4478622), jac = c(0.0714285714285714, 0.333333333333333, 0.5, 
0.25, 0.2, 0.2, 0.25, 0.5, 0.0909090909090909, 0.0714285714285714, 
0.25, 0.142857142857143, 0.125, 0.2, 0.25, 0.0714285714285714, 
0.333333333333333, 0.333333333333333, 0.333333333333333, 0.5, 
0.142857142857143, 0.0714285714285714, 0.333333333333333, 0.5, 
0.25, 0.2, 0.2, 0.25, 0.5, 0.5, 1, 0.2, 0.0769230769230769, 0.125, 
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.125, 
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.125, 
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.0714285714285714, 
0.333333333333333, 0.5, 0.25, 0.2, 0.2, 0.25, 0.5, 0.5, 1, 0.25, 
0.5, 0.2, 0.2, 0.333333333333333, 1, 0.333333333333333, 0.166666666666667, 
1, 0.25, 0.333333333333333, 0.5, 0.5, 0.125, 0.333333333333333, 
1, 0.142857142857143, 0.0769230769230769, 0.333333333333333, 
0.333333333333333, 0.1, 0.111111111111111, 0.0714285714285714, 
0.0714285714285714, 0.333333333333333, 0.5, 0.25, 0.2, 0.2, 0.25, 
0.5, 0.5, 1, 0.25, 0.5, 0.2, 0.2, 0.125, 0.2), idx = 1:100), sorted = "origpat", class = c("data.table", 
"data.frame"), row.names = c(NA, -100L), .internal.selfref = <pointer: 0x000000000f4a1ef0>)

    dput(pp[1:75])
structure(list(pnum = c(4246034, 4247592, 4248614, 4248761, 4251278, 
4253719, 4262069, 4262069, 4262069, 4262575, 4267550, 4273567, 
4273567, 4273568, 4273568, 4273569, 4273586, 4273586, 4273586, 
4273586, 4273813, 4275419, 4277285, 4277285, 4277285, 4277286, 
4277286, 4277322, 4279234, 4279234, 4280829, 4281094, 4281094, 
4282003, 4282003, 4282022, 4282022, 4282035, 4282578, 4282578, 
4282648, 4282648, 4282648, 4283218, 4283439, 4283439, 4283439, 
4284213, 4284319, 4284686, 4284872, 4285730, 4285730, 4285731, 
4286061, 4286061, 4286061, 4286061, 4286978, 4286978, 4288356, 
4288356, 4288470, 4288470, 4289517, 4289517, 4289522, 4289522, 
4289859, 4289859, 4290100, 4290586, 4290586, 4290843, 4290843
), inventors = c("03868264-1", "04070526-2", "04204850-2", "03862079-1", 
"03902910-1", "04253719-1", "03862861-2", "04262069-2", "04262069-3", 
"04216691-1", "04128775-2", "04273567-1", "04273567-2", "04273567-1", 
"04273567-2", "04273567-2", "03879694-2", "03912524-1", "04116704-4", 
"04116704-5", "04273813-1", "03871022-1", "04057435-1", "04277285-2", 
"04277285-3", "03973976-1", "04179638-3", "03958052-2", "03878357-1", 
"03886401-1", "04280829-1", "04039504-1", "04281094-2", "04236065-1", 
"04236065-2", "04204851-1", "04282022-1", "04282035-1", "04169243-1", 
"04282578-2", "04282648-1", "04282648-2", "04282648-3", "04273567-2", 
"04063973-3", "04283439-1", "04283439-2", "04284213-1", "03964009-1", 
"04086089-2", "04229692-1", "03858548-1", "04046545-1", "04282035-1", 
"03868304-1", "04039387-1", "04039387-2", "04149936-2", "04126436-1", 
"04286978-2", "04288356-1", "04288356-2", "03979604-1", "03980915-3", 
"04126436-1", "04286978-2", "04126436-1", "04286978-2", "03900416-1", 
"04070526-2", "04290100-1", "03958052-2", "03978316-2", "04290843-1", 
"04290843-2"), pryear = c(1980L, 1980L, 1979L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1980L, 1977L, 1980L, 1980L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1979L, 
1979L, 1979L, 1980L, 1980L, 1980L, 1979L, 1979L, 1980L, 1979L, 
1979L, 1978L, 1978L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1977L, 1977L, 1977L, 1980L, 1980L, 1980L, 
1978L, 1979L, 1979L, 1980L, 1978L, 1978L, 1978L, 1978L, 1980L, 
1980L, 1980L, 1980L, 1978L, 1978L, 1980L, 1980L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L), pr_date = structure(c(3665, 
3723, 3364, 3681, 3672, 3679, 3728, 3728, 3728, 2818, 3676, 3693, 
3693, 3693, 3693, 3693, 3826, 3826, 3826, 3826, 3791, 3710, 3298, 
3298, 3298, 3701, 3701, 3686, 3298, 3298, 3784, 3623, 3623, 3261, 
3261, 3770, 3770, 3697, 3728, 3728, 3735, 3735, 3735, 3693, 2913, 
2913, 2913, 3840, 3693, 3812, 2934, 3564, 3564, 3697, 3169, 3169, 
3169, 3169, 3836, 3836, 3925, 3925, 3266, 3266, 3836, 3836, 3836, 
3836, 3805, 3805, 3735, 3686, 3686, 3701, 3701), class = "Date"), 
    inv_patc = c(1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 
    3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 
    2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 1, 2, 1, 2, 
    1, 1, 2), iid = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
    11L, 12L, 13L, 12L, 13L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
    20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
    32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 13L, 41L, 42L, 
    43L, 44L, 45L, 46L, 47L, 48L, 49L, 35L, 50L, 51L, 52L, 53L, 
    54L, 55L, 56L, 57L, 58L, 59L, 54L, 55L, 54L, 55L, 60L, 2L, 
    61L, 25L, 62L, 63L, 64L)), class = c("data.table", "data.frame"
), row.names = c(NA, -75L), .internal.selfref = <pointer: 0x000000000f4a1ef0>)

根据我的计算和上面使用的代码,运行1100万行需要大约85个小时。 代码的一个改进

# Row-by-row lookup: idx identifies one dlong row, so each group fetches the
# iid values of both patents from pp and reports their overlap count and
# Jaccard similarity. print(.GRP) emits the running group counter.
temp <- dlong[, {
  print(.GRP)
  iids_orig <- pp[pnum == origpat, iid]
  iids_ref <- pp[pnum == ref.pat, iid]
  list(
    overlap = sum(iids_orig %in% iids_ref),
    inv.jac = jaccard3(iids_orig, iids_ref)
  )
}, by = idx]

可能是不再逐个遍历索引idx,而是先按origpat循环(这样x最多可以在500行内保持不变),然后在固定origpat的情况下遍历所有不同的ref.pat值。但是,我只知道如何用循环来实现这一点,而那样就失去了使用data.table的意义。

有任何提高速度的建议吗?

谢谢

1 个答案:

答案 0 :(得分:0)

您可以试试以下内容。

  1. 将dlong与pp按origpat进行连接(join)。

  2. 然后将该结果再次与pp按ref.pat进行连接,并计算您的jaccard函数中的分母。

  3. 然后将第一步的结果与pp按ref.pat和iid做内连接(inner join),以计算jaccard函数中的分子。

  4. 最后计算jaccard值。

  5. 示例数据:

    library(data.table)
    # Synthetic inputs: nDlong patent pairs and npat patent rows, with iid
    # drawn from npat / 5 possible values. The seed is fixed so the sample is
    # reproducible.
    set.seed(0L)
    nDlong <- 11e6
    npat <- 230e3
    dlong <- data.table(
      idx = seq_len(nDlong),
      origpat = sample(npat, nDlong, replace = TRUE),
      ref.pat = sample(npat, nDlong, replace = TRUE)
    )
    pp <- data.table(
      pnum = seq_len(npat),
      iid = sample(npat / 5, npat, replace = TRUE)
    )
    

    代码+时间:

    # Join-based replacement for the row-by-row lookup. Times three joins plus
    # the final division; the result is `ans` with columns idx and jaccVal.
    system.time({
        # Attach to every dlong row the pp rows whose pnum equals its origpat.
        opat <- pp[dlong, on = c("pnum" = "origpat"), allow.cartesian = TRUE]

        # Denominator: join pp again on ref.pat and count the distinct ids
        # across both sides per idx. na.rm = TRUE keeps the NA placeholder
        # that an unmatched patent produces from being counted as a member.
        jacc <- pp[opat, on = c("pnum" = "ref.pat"), allow.cartesian = TRUE][,
            .(setlen = uniqueN(c(i.iid, iid), na.rm = TRUE)), by = .(idx)]

        # Numerator: the inner join on both ref.pat and iid keeps only ids
        # present on both sides; the row count per idx is the overlap.
        olaps <- pp[opat, on = c("pnum" = "ref.pat", "iid" = "iid"), nomatch = 0L][,
            .(overlaps = .N), by = .(idx)]

        # idx values without any shared id are absent from olaps, so their
        # overlap is NA after the join and is replaced by 0 before dividing.
        ans <- olaps[jacc,
            .(idx, jaccVal = replace(overlaps, is.na(overlaps), 0) / setlen),
            on = .(idx)]
    })
    
    #   user  system elapsed 
    # 154.10    1.14  163.09