Question

长时间潜伏，第一次问问。

我试图计算两组物品之间的共同物品。对于20M +项目数据集。示例数据如下所示。

#serially numbered items
parents <- rep(1:10000)

#generate rnorm # of children items
numchild <- round(rnorm(10000, mean=30, sd=10))

#fill the parent-child list
parent_child <- list()
for (x in 1:length(parents)){
  if (numchild[x]>0){
    f1 <- sample(1:length(parents), size=numchild[x])
    f2 <- list(parents[f1])
    parent_child <- c(parent_child, f2)
  }
  else {
    parent_child <- c(parent_child, list(x+1))    #if numchild=0, make up something
  }
}

这就是我想要做的事情：说父项＃1有5个子项 - 1,2,3,4,5，父项＃2有3个子项 - 4,10,22。

我想计算每个（parent_i，parent_j）组合的长度（交集）。在上面的例子中，它将是1个共同项 - 4。

我这样做是为了10M +父项，平均有15-20个儿童项目，范围为（0,100）。所以这是一个10M x 10M的项目矩阵。

我有一个foreach循环，我正在测试一个较小的子集，但它不能完全扩展到整个数据集（64核心机器具有256GB RAM）。通过下面的循环，我已经只计算了用户 - 用户矩阵的一半 - ＆gt; （parent_i，parent_j）与（parent_j，parent_i）相同。为此目的。

#small subset
a <- parent_child[1:1000]

outerresults <- foreach (i = 1:(length(a)), .combine=rbind, .packages=c('foreach','doParallel')) %dopar% {
  b <- a[[i]]
  rest <- a[i+1:length(a)]

  foreach (j = 1:(length(rest)), .combine=rbind) %dopar% {
    common <- length(intersect(b, rest[[j]]))
    if (common > 0) {g <- data.frame(u1=i, u2=j+1, common)}
  }  
}

我一直在试验这方面的变化（使用Reduce，将父母子女存储在daataframe等中）但是没有多少运气。

有没有办法实现这种规模？

Answer 1

我改变了分裂，以便我们有一个孩子与父母的关系

len <- sapply(parent_child, length)
child_parent <- split(rep(seq_along(parent_child), len), 
                      unlist(parent_child, use.names=FALSE))

类似下面这样的内容构建一个字符串，其中包含所有孩子共享孩子的父母对

keep <- sapply(child_parent, length) > 1
int <- lapply(child_parent[keep], function(x) {
    x <- combn(sort(x), 2)
    paste(x[1,], x[2,], sep=".")
})

和理货

table(unlist(int, use.names=FALSE))

或者更快一点

xx <- unlist(int, use.names=FALSE)
nms <- unique(xx)
cnt <- match(xx, nms)
setNames(tabulate(cnt, length(nms), nms)

的

f1 <- function(parent_child) {
    len <- sapply(parent_child, length)
    child_parent <- split(rep(seq_along(parent_child), len), 
                          unlist(parent_child, use.names=FALSE))

    keep <- sapply(child_parent, length) > 1
    int <- lapply(child_parent[keep], function(x) {
        x <- combn(sort(x), 2)
        paste(x[1,], x[2,], sep=".")
    })

    xx <- unlist(int, use.names=FALSE)
    nms <- unique(xx)
    cnt <- match(xx, nms)
    setNames(tabulate(cnt, length(nms)), nms)
}

with（这适用于所有10000个父子元素）

> system.time(ans1 <- f1(parent_child))
   user  system elapsed 
 14.625   0.012  14.668 
> head(ans1)
542.1611 542.1832 542.2135 542.2435 542.2527 542.2806 
       1        1        1        1        1        1

我不确定这是否会真正扩展到你所谈论的问题的大小 - 但它是每个孩子的父母数量的多项式。

加速的一种可能性是“记忆”组合计算，使用参数的长度作为“键”并将组合存储为“值”。这会将调用combn的次数减少为child_parent的唯一元素长度数。

combn1 <- local({
    memo <- new.env(parent=emptyenv())
    function(x) {
        key <- as.character(length(x))
        if (!exists(key, memo))
            memo[[key]] <- t(combn(length(x), 2))
        paste(x[memo[[key]][,1]], x[memo[[key]][,2]], sep=".")
    }
})

f2 <- function(parent_child) {
    len <- sapply(parent_child, length)
    child_parent <- split(rep(seq_along(parent_child), len), 
                          unlist(parent_child, use.names=FALSE))

    keep <- sapply(child_parent, length) > 1
    int <- lapply(child_parent[keep], combn1)

    xx <- unlist(int, use.names=FALSE)
    nms <- unique(xx)
    cnt <- match(xx, nms)
    setNames(tabulate(cnt, length(nms)), nms)
}

有点帮助

>     system.time(ans2 <- f2(parent_child))
   user  system elapsed 
  5.337   0.000   5.347 
>     identical(ans1, ans2)
[1] TRUE

缓慢的部分现在是paste

>     Rprof(); ans2 <- f2(parent_child); Rprof(NULL); summaryRprof()
$by.self
                 self.time self.pct total.time total.pct
"paste"               3.92    73.41       3.92     73.41
"match"               0.74    13.86       0.74     13.86
"unique.default"      0.40     7.49       0.40      7.49
"as.character"        0.08     1.50       0.08      1.50
"unlist"              0.08     1.50       0.08      1.50
"combn"               0.06     1.12       0.06      1.12
"lapply"              0.02     0.37       4.00     74.91
"any"                 0.02     0.37       0.02      0.37
"setNames"            0.02     0.37       0.02      0.37

$by.total
...

我们可以通过将具有共享子ID的父代码编码为单个整数来避免这种情况;由于浮点数用R表示的方式，这将是精确的直到大约2 ^ 21

encode <- function(x, y, n)
    (x - 1) * (n + 1) + y
decode <- function(z, n)
    list(x=ceiling(z / (n + 1)), y = z %% (n + 1))

将combn1和f2函数调整为

combn2 <- local({
    memo <- new.env(parent=emptyenv())
    function(x, encode_n) {
        key <- as.character(length(x))
        if (!exists(key, memo))
            memo[[key]] <- t(combn(length(x), 2))
        encode(x[memo[[key]][,1]], x[memo[[key]][,2]], encode_n)
    }
})

f3 <- function(parent_child) {
    encode_n <- length(parent_child)
    len <- sapply(parent_child, length)
    child_parent <-
        unname(split(rep(seq_along(parent_child), len), 
                     unlist(parent_child, use.names=FALSE)))

    keep <- sapply(child_parent, length) > 1
    int <- lapply(child_parent[keep], combn2, encode_n)

    id <- unlist(int, use.names=FALSE)
    uid <- unique(xx)
    n <- tabulate(match(xx, uid), length(uid))
    do.call(data.frame, c(decode(uid, encode_n), list(n=n)))
}

导致

> system.time(f3(parent_child))
   user  system elapsed 
  2.140   0.000   2.146

这与jlhoward的修订答案相比非常有利（注意前一行中的时间是10,000个父子关系）

> system.time(result.3 <- do.call("rbind",lapply(1:99,gg)))
   user  system elapsed 
  2.465   0.000   2.468
> system.time(f3(parent_child[1:99]))
   user  system elapsed 
  0.016   0.000   0.014

以更合理的方式进行扩展。

对于它的价值，数据生成例程位于Patrick Burn的 R Inferno 的第二个循环中，使用“复制 - 追加”算法而不是预先分配空间并填充它通过将for循环体作为函数并使用lapply来避免这种情况。通过先前修复问题

，避免在for循环中需要复杂的条件

numchild <- round(rnorm(10000, mean=30, sd=10))
numchild[numchild < 0] <- sample(numchild[numchild > 0], sum(numchild < 0))

或通过从生成正整数值的分布（rpois，rbinom）中采样。然后数据生成

n_parents <- 10000
numchild <- round(rnorm(n_parents, mean=30, sd=10))
numchild[numchild < 0] <- sample(numchild[numchild > 0], sum(numchild < 0))
parent_child <- lapply(numchild, sample, x=n_parents)

Answer 2

这是另一种方法，比我之前的答案快10倍，并且比原始代码快17倍（也更简单）：

ff <- function(u2, u1, a) {
  common <- length(intersect(a,parent_child[[u2]]))
  if (common>0) {return(data.frame(u1,u2,common))}
}

gg <- function(u1) {
  a <- parent_child[[u1]]
  do.call("rbind",lapply((u1+1):100,ff,u1,a))
}

system.time(result.3 <- do.call("rbind",lapply(1:99,gg)))
   user  system elapsed 
   1.04    0.00    1.03

result.3与之前回答的result.2相同：

max(abs(result.3-result.2))
[1] 0

Answer 3

嗯，一点点改进（我认为）：

原始代码（包含在函数调用中）：

f = function(n) {
  #small subset
  a <- parent_child[1:n]

  outerresults <- foreach (i = 1:(length(a)), 
                           .combine=rbind,
                           .packages=c('foreach','doParallel')) %dopar% {
    b <- a[[i]]
    rest <- a[i+1:length(a)]

    foreach (j = 1:(length(rest)), .combine=rbind) %dopar% {
      common <- length(intersect(b, rest[[j]]))
      if (common > 0) {g <- data.frame(u1=i, u2=j+1, common)}
    }  
  }  
  return(outerresults)
}

修改后的代码：

g <- function(n) {
  a <- parent_child[1:n]

  outerresults <- foreach (i = 1:n, 
                           .combine=rbind, 
                           .packages=c('foreach','doParallel')) %dopar% {
    b <- a[[i]]

    foreach (j = (i):n, .combine=rbind) %dopar% {
      if (i!=j) {
        c <- a[[j]]
        common <- length(intersect(b, c))
        if (common > 0) {g <- data.frame(u1=i, u2=j, common)}
      }
    }  
  }
  return(outerresults)
}

基准：

system.time(result.old<-f(100))
   user  system elapsed 
  17.21    0.00   17.33 
system.time(result.new<-g(100))
   user  system elapsed 
  10.42    0.00   10.47

由于不同的方法，u2的编号略有不同，但两者都产生相同的匹配矢量：

max(abs(result.old$common-result.new$common))
[1] 0

我尝试使用替换intersect(...)的数据表连接，但它实际上要慢得多（!!）

如何在R中向量化或以其他方式加速这种循环逻辑？

3 个答案: