Question

列表：

terms <- list(Item1 = c("a", "b", "c", "d"),
              Item2 = c("a", "e", "f", "g"),
              Item3 = c("b", "e", "h", "i"),
              Item4 = c("j", "k"))

我想获得列表中每对项目之间共享字母的数量。因此预期的输出是：

     [,1] [,2] [,3] [,4]
[1,]    4    1    1    0
[2,]    1    4    1    0
[3,]    1    1    4    0
[4,]    0    0    0    2

从之前的StackOverflow回答中，我找到了一个可能的解决方案：

overlapLength <- function(x, y) mapply(function(x, y) 
  length(intersect(x, y)), terms[x], terms[y])
s <- seq_along(terms)
outer(s, s, overlapLength)

但这对我的名单来说非常慢，这是非常大的（约9,000项）。

有更快的方法吗？

感谢大家的意见。我用我列表中的前100个项目计算了所有答案。

> system.time(f_crossprod(go))
   user  system elapsed 
  0.024   0.001   0.025 
> system.time(f_crossprod2(go))
   user  system elapsed 
  0.007   0.000   0.008 
> system.time(f_mapply(go))
   user  system elapsed 
  2.018   0.032   2.059 
> system.time(f_outer(go))
   user  system elapsed 
  1.950   0.016   1.979 
> system.time(f_combn(go))
   user  system elapsed 
  1.056   0.005   1.062 
> system.time(f_Rcpp(go))
   user  system elapsed 
163.236  84.226 249.240

然后我将outer和Matrix::crossprod解决方案与~9,000个元素的整个列表进行了对比。 outer解决方案在大约55分钟内完成。 Matrix::crossprod解决方案在大约0.1秒内运行！

我可能在执行Rcpp函数时出错了。但是，@ alexis_laz如果你发表评论我会接受它。

顺便说一句，抱歉我不清楚，我对对角线的价值不感兴趣。

Answer 1

我们可以使用outer

outer(names(terms), names(terms), FUN = function(x,y) 
              lengths(Map(intersect, terms[x], terms[y])))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

或更紧凑

outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
#      Item1 Item2 Item3 Item4
#Item1     4     1     1     0
#Item2     1     4     1     0
#Item3     1     1     4     0
#Item4     0     0     0     2

我们也可以在Rcpp中实现这一点。以下是test1.cpp文件

#include <Rcpp.h>
#include <math.h>

using namespace Rcpp;
//[[Rcpp::export]]

List foo(List xs) {
    List x(xs);
    List x1 = Rcpp::clone(xs);
    List y1 = Rcpp::clone(xs);
    int n = x1.size();



    NumericVector res;


    for( int i=0; i<n; i++){
        for(int j=0; j<n; j++){
         CharacterVector xd = x1[i];
         CharacterVector yd = y1[j];

        res.push_back(intersect(xd, yd).length());
        }
    }
    return wrap(res) ;

我们使用

在R中调用它

library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

基准

除了上述功能之外，我们还添加了另一个版本RcppEigen已发布here

n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
         replace = TRUE)), paste0("Item", seq_len(n)))

library(Matrix)
library(inline)
library(Rcpp)

alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <-  function() {Matrix::crossprod(xtabs( ~ values + ind, 
            stack(terms1), sparse = TRUE)) }

akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
                      funCPr(tbl, tbl)[[1]]}

db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
    sapply(terms1, function(a)
        sum(unlist(terms1[i]) %in% unlist(a)))))} 
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
                      combn(seq_along(terms1), 2,
                            function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
         c("col1", "col2", "counts"))}

<{1}}处 n 的基准输出

library(microbenchmark) microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(), unit = "relative", times = 10L) #Unit: relative # expr min lq mean median uq max neval cld # alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a # alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a # akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d # akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b # akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a # db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b # lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c

稍高 n

将 n 设置为n <- 200 set.seed(24) terms1 <- setNames(replicate(n, sample(letters, sample(10), replace = TRUE)), paste0("Item", seq_len(n))) microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L) #Unit: relative # expr min lq mean median uq max neval cld # alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a # alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b # akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a # db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c

检查输出

n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
                replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(),  akrun3(), unit = "relative", times = 10L)
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329    10  b 
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927    10   c
#  akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10 a

基于@alexis_laz的评论，我们还包含3个函数来替换res1 <- alexis1() res2 <- akrun3() res3 <- alexis2() all.equal(res1, res2, check.attributes = FALSE) #[1] TRUE all.equal(res1, as.matrix(res3), check.attributes = FALSE) #[1] TRUE部分，以比较table/stack

的 n 的效率

并且基准是

alexis3 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    crossprod(tab, tab)
    }

alexis4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)

       tab = sparseMatrix(x = 1L, i = match(unlt, u),
           j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))

       Matrix::crossprod(tab, tab, sparse = TRUE)
       }

akrun4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)
        tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
        tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
        funCPr(tab, tab)[[1]]
      }

Answer 2

这使用combn生成术语组合的data.frame，其中术语的值不同。 setNames添加了变量名称。

result <- setNames(data.frame(t(combn(names(terms), 2)),
                      combn(seq_along(terms), 2,
                            function(x) length(intersect(terms[[x[1]]], terms[[x[2]]])))),
         c("col1", "col2", "counts"))

返回

result
   col1  col2 counts
1 Item1 Item2      1
2 Item1 Item3      1
3 Item1 Item4      0
4 Item2 Item3      1
5 Item2 Item4      0
6 Item3 Item4      0

如果需要，您可以使用lengths获取自己的期限长度，然后使用rbind结果

temp <- lengths(terms)
rbind(result, data.frame(col1=names(temp), col2=names(temp), counts=temp, row.names=NULL))
    col1  col2 counts
1  Item1 Item2      1
2  Item1 Item3      1
3  Item1 Item4      0
4  Item2 Item3      1
5  Item2 Item4      0
6  Item3 Item4      0
7  Item1 Item1      4
8  Item2 Item2      4
9  Item3 Item3      4
10 Item4 Item4      2

Answer 3

我不确定这是否更快或更有效，但确实很有趣。

do.call(rbind, lapply(1:length(terms), function(i)
    sapply(terms, function(a)
        sum(unlist(terms[i]) %in% unlist(a)))))
#     Item1 Item2 Item3 Item4
#[1,]     4     1     1     0
#[2,]     1     4     1     0
#[3,]     1     1     4     0
#[4,]     0     0     0     2

获取列表

3 个答案:

基准