我正在寻找提高以下计算效率的方法
#' Jaccard similarity of two vectors treated as sets
#'
#' @param a,b Atomic vectors of comparable type (e.g. integer ids).
#' @return Size of the intersection divided by the size of the union, as a
#'   double; `NaN` when both inputs are empty (0 / 0).
jaccard3 <- function(a, b) {
  # Count distinct shared values. sum(a %in% b) would count repeated values
  # of `a` several times while the denominator is de-duplicated, which can
  # push the ratio above 1.
  length(intersect(a, b)) / length(union(a, b))
}
# For every idx group of dlong, look up the inventor ids (iid) of both
# patents in pp and report their overlap count and Jaccard value.
temp <- dlong[, {
  print(.GRP)  # progress output: counter of the group being processed
  orig_iids <- pp[pnum == origpat, iid]
  ref_iids <- pp[pnum == ref.pat, iid]
  list(
    overlap = sum(orig_iids %in% ref_iids),
    inv.jac = jaccard3(orig_iids, ref_iids)
  )
}, by = idx]
我依赖的两个data.tables如下所示
sapply(dlong, class)
origpat ref.pat jac originv refinv idx
"numeric" "numeric" "numeric" "integer" "integer" "integer"
和
sapply(pp, class)
pnum inventors pryear pr_date inv_patc iid
"integer" "character" "integer" "Date" "numeric" "integer"
因此,我的目标是查找origpat和ref.pat各自的发明者(他们在pp中以字符形式存储在inventors列中,并以整数形式存储在iid列中;我假设整数匹配应该比字符匹配更快)。
dlong的结构是:origpat和ref.pat的值都可能出现多次(同一个origpat可能连续出现500次),但origpat与ref.pat的组合是唯一的。dlong有大约1,100万行,每一行由唯一的origpat–ref.pat组合定义;而pp有大约230,000行,每行由唯一的pnum–iid组合定义。
我添加了一个非常简短的数据摘录[评论后更新]
dput(dlong[1:100])
structure(list(origpat = c(4246034, 4246034, 4246034, 4246034,
4246034, 4246034, 4246034, 4246034, 4247592, 4247592, 4247592,
4247592, 4248614, 4248614, 4248614, 4248614, 4248614, 4248614,
4248614, 4248614, 4248761, 4251278, 4251278, 4251278, 4251278,
4251278, 4251278, 4251278, 4251278, 4251278, 4253719, 4253719,
4267550, 4273567, 4273567, 4273567, 4273567, 4273567, 4273567,
4273568, 4273568, 4273568, 4273568, 4273568, 4273568, 4273569,
4273569, 4273569, 4273569, 4273569, 4273569, 4273586, 4273586,
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586,
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586,
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586,
4273586, 4273586, 4273586, 4273586, 4273586, 4273586, 4273586,
4273813, 4273813, 4277285, 4277286, 4277286, 4277286, 4277286,
4277286, 4277286, 4277286, 4277286, 4277286, 4277286, 4277286,
4277286, 4277286, 4277286, 4280829, 4280829), ref.pat = c(4293439,
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730,
4431982, 4303735, 4309520, 4315970, 4358181, 4478622, 4312654,
4293439, 4406517, 4478623, 4277285, 4328022, 4377675, 4293439,
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730,
4246034, 4491379, 4534098, 4338590, 4358181, 4478622, 4312654,
4406517, 4478623, 4248614, 4358181, 4478622, 4312654, 4406517,
4478623, 4248614, 4358181, 4478622, 4312654, 4406517, 4478623,
4248614, 4293439, 4277285, 4328022, 4248614, 4297139, 4395271,
4294190, 4285730, 4246034, 4251278, 4339174, 4282035, 4285731,
4311504, 4277286, 4298389, 4362819, 4360567, 4295872, 4302235,
4331770, 4282022, 4304584, 4344670, 4297193, 4292080, 4284686,
4385802, 4311529, 4314031, 4310651, 4534948, 4293439, 4293439,
4277285, 4328022, 4248614, 4297139, 4395271, 4294190, 4285730,
4246034, 4251278, 4339174, 4282035, 4285731, 4311504, 4358181,
4478622), jac = c(0.0714285714285714, 0.333333333333333, 0.5,
0.25, 0.2, 0.2, 0.25, 0.5, 0.0909090909090909, 0.0714285714285714,
0.25, 0.142857142857143, 0.125, 0.2, 0.25, 0.0714285714285714,
0.333333333333333, 0.333333333333333, 0.333333333333333, 0.5,
0.142857142857143, 0.0714285714285714, 0.333333333333333, 0.5,
0.25, 0.2, 0.2, 0.25, 0.5, 0.5, 1, 0.2, 0.0769230769230769, 0.125,
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.125,
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.125,
0.2, 0.25, 0.333333333333333, 0.333333333333333, 0.25, 0.0714285714285714,
0.333333333333333, 0.5, 0.25, 0.2, 0.2, 0.25, 0.5, 0.5, 1, 0.25,
0.5, 0.2, 0.2, 0.333333333333333, 1, 0.333333333333333, 0.166666666666667,
1, 0.25, 0.333333333333333, 0.5, 0.5, 0.125, 0.333333333333333,
1, 0.142857142857143, 0.0769230769230769, 0.333333333333333,
0.333333333333333, 0.1, 0.111111111111111, 0.0714285714285714,
0.0714285714285714, 0.333333333333333, 0.5, 0.25, 0.2, 0.2, 0.25,
0.5, 0.5, 1, 0.25, 0.5, 0.2, 0.2, 0.125, 0.2), idx = 1:100), sorted = "origpat", class = c("data.table",
"data.frame"), row.names = c(NA, -100L), .internal.selfref = <pointer: 0x000000000f4a1ef0>)
和
dput(pp[1:75])
structure(list(pnum = c(4246034, 4247592, 4248614, 4248761, 4251278,
4253719, 4262069, 4262069, 4262069, 4262575, 4267550, 4273567,
4273567, 4273568, 4273568, 4273569, 4273586, 4273586, 4273586,
4273586, 4273813, 4275419, 4277285, 4277285, 4277285, 4277286,
4277286, 4277322, 4279234, 4279234, 4280829, 4281094, 4281094,
4282003, 4282003, 4282022, 4282022, 4282035, 4282578, 4282578,
4282648, 4282648, 4282648, 4283218, 4283439, 4283439, 4283439,
4284213, 4284319, 4284686, 4284872, 4285730, 4285730, 4285731,
4286061, 4286061, 4286061, 4286061, 4286978, 4286978, 4288356,
4288356, 4288470, 4288470, 4289517, 4289517, 4289522, 4289522,
4289859, 4289859, 4290100, 4290586, 4290586, 4290843, 4290843
), inventors = c("03868264-1", "04070526-2", "04204850-2", "03862079-1",
"03902910-1", "04253719-1", "03862861-2", "04262069-2", "04262069-3",
"04216691-1", "04128775-2", "04273567-1", "04273567-2", "04273567-1",
"04273567-2", "04273567-2", "03879694-2", "03912524-1", "04116704-4",
"04116704-5", "04273813-1", "03871022-1", "04057435-1", "04277285-2",
"04277285-3", "03973976-1", "04179638-3", "03958052-2", "03878357-1",
"03886401-1", "04280829-1", "04039504-1", "04281094-2", "04236065-1",
"04236065-2", "04204851-1", "04282022-1", "04282035-1", "04169243-1",
"04282578-2", "04282648-1", "04282648-2", "04282648-3", "04273567-2",
"04063973-3", "04283439-1", "04283439-2", "04284213-1", "03964009-1",
"04086089-2", "04229692-1", "03858548-1", "04046545-1", "04282035-1",
"03868304-1", "04039387-1", "04039387-2", "04149936-2", "04126436-1",
"04286978-2", "04288356-1", "04288356-2", "03979604-1", "03980915-3",
"04126436-1", "04286978-2", "04126436-1", "04286978-2", "03900416-1",
"04070526-2", "04290100-1", "03958052-2", "03978316-2", "04290843-1",
"04290843-2"), pryear = c(1980L, 1980L, 1979L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1977L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1979L,
1979L, 1979L, 1980L, 1980L, 1980L, 1979L, 1979L, 1980L, 1979L,
1979L, 1978L, 1978L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1977L, 1977L, 1977L, 1980L, 1980L, 1980L,
1978L, 1979L, 1979L, 1980L, 1978L, 1978L, 1978L, 1978L, 1980L,
1980L, 1980L, 1980L, 1978L, 1978L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L), pr_date = structure(c(3665,
3723, 3364, 3681, 3672, 3679, 3728, 3728, 3728, 2818, 3676, 3693,
3693, 3693, 3693, 3693, 3826, 3826, 3826, 3826, 3791, 3710, 3298,
3298, 3298, 3701, 3701, 3686, 3298, 3298, 3784, 3623, 3623, 3261,
3261, 3770, 3770, 3697, 3728, 3728, 3735, 3735, 3735, 3693, 2913,
2913, 2913, 3840, 3693, 3812, 2934, 3564, 3564, 3697, 3169, 3169,
3169, 3169, 3836, 3836, 3925, 3925, 3266, 3266, 3836, 3836, 3836,
3836, 3805, 3805, 3735, 3686, 3686, 3701, 3701), class = "Date"),
inv_patc = c(1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2,
3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1,
2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 1, 2, 1, 2,
1, 1, 2), iid = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 12L, 13L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 13L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 35L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L, 54L, 55L, 54L, 55L, 60L, 2L,
61L, 25L, 62L, 63L, 64L)), class = c("data.table", "data.frame"
), row.names = c(NA, -75L), .internal.selfref = <pointer: 0x000000000f4a1ef0>)
根据我的计算和上面使用的代码,运行1100万行需要大约85个小时。 代码的一个改进
# One group per idx value: fetch the inventor ids (iid) of origpat and of
# ref.pat from pp, then compute the overlap count and the Jaccard value.
temp <- dlong[, {
  print(.GRP)  # progress output: counter of the group being processed
  orig_iids <- pp[pnum == origpat, iid]
  ref_iids <- pp[pnum == ref.pat, iid]
  list(
    overlap = sum(orig_iids %in% ref_iids),
    inv.jac = jaccard3(orig_iids, ref_iids)
  )
}, by = idx]
可能是不再逐个遍历索引idx,而是先按origpat循环(这样x最多可以在500行内保持不变),然后在固定origpat的情况下遍历所有不同的ref.pat值。但是,我只知道如何用循环来实现这一点,而这样就失去了使用data.table的意义。
有任何提高速度的建议吗?
谢谢
答案 0 :(得分:0)
您可以试试以下内容。
先将dlong与pp按origpat连接。
然后将该结果再次与pp按ref.pat连接,并计算jaccard函数中的分母。
接着将第一个结果与pp按ref.pat和iid做内连接,以计算jaccard函数中的分子。
最后计算jaccard值。
示例数据:
library(data.table)
# Reproducible synthetic inputs: nDlong patent pairs drawn from npat patents,
# and one inventor id (iid) per patent in pp.
set.seed(0L)
nDlong <- 11e6
npat <- 230e3
dlong <- data.table(
  idx = seq_len(nDlong),
  origpat = sample.int(npat, nDlong, replace = TRUE),
  ref.pat = sample.int(npat, nDlong, replace = TRUE)
)
pp <- data.table(
  pnum = seq_len(npat),
  iid = sample.int(npat / 5, npat, replace = TRUE)
)
代码+时间:
system.time({
  # Step 1: attach the inventor ids (iid) of each origpat to the rows of
  # dlong; one output row per (idx, origpat inventor).
  opat <- pp[dlong, on = c("pnum" = "origpat"), allow.cartesian = TRUE]
  # Step 2: join again on ref.pat so each idx carries the inventor ids of
  # both patents (i.iid = origpat side, iid = ref.pat side), then count the
  # distinct ids per idx: the Jaccard denominator. na.rm = TRUE prevents
  # the NA produced by a patent without a match in pp from being counted as
  # an extra inventor.
  jacc <- pp[opat, on = c("pnum" = "ref.pat"), allow.cartesian = TRUE][,
    .(setlen = uniqueN(c(i.iid, iid), na.rm = TRUE)), by = .(idx)]
  # Step 3: inner join on both ref.pat and iid keeps only inventors shared
  # by the two patents; the row count per idx is the Jaccard numerator.
  olaps <- pp[opat, on = c("pnum" = "ref.pat", "iid" = "iid"), nomatch = 0L][,
    .(overlaps = .N), by = .(idx)]
  # Step 4: idx values with no shared inventor are absent from olaps, so
  # their overlap is NA after the join and is treated as 0.
  ans <- olaps[jacc,
    .(idx, jaccVal = replace(overlaps, is.na(overlaps), 0) / setlen),
    on = .(idx)]
})
# user system elapsed
# 154.10 1.14 163.09