Question

我有两种输入，格式如下：

domains = list(
    O60925 = "PF01920",
    P01130 = c("PF07645", "PF00057", "PF00058"),
    Q14764 = c("PF11978", "PF01505"),
    Q9BX68 = "PF01230",
    P46777 = "PF14204")

interactions = structure(c(1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 
0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0), .Dim = c(8L, 8L), .Dimnames = list(c("PF01920", 
"PF07645", "PF00057", "PF00058", "PF11978", "PF01505", "PF01230", 
"PF14204"), c("PF01920", "PF07645", "PF00057", "PF00058", "PF11978", 
"PF01505", "PF01230", "PF14204")))

        PF01920 PF07645 PF00057 PF00058 PF11978 PF01505 PF01230 PF14204
PF01920       1       0       0       0       0       0       1       0
PF07645       0       1       0       1       0       0       0       0
PF00057       0       0       1       1       0       0       0       0
PF00058       0       1       1       1       0       0       0       0
PF11978       0       0       0       0       1       0       0       0
PF01505       0       0       0       0       0       1       0       0
PF01230       1       0       0       0       0       0       1       0
PF14204       0       0       0       0       0       0       0       0

我想计算以下输出，其中每个单元格中的整数表示interactions列表中每对名称的domains矩阵中所有单元格的总和。

       O60925 P01130 Q14764 Q9BX68 P46777
O60925      1      0      0      1      0
P01130      0      7      0      0      0
Q14764      0      0      2      0      0
Q9BX68      1      0      0      1      0
P46777      0      0      0      0      0

上下文是我有一个蛋白质列表（domains列表的名称）及其Pfam域名（domains列表中的条目），以及已知Pfam域名的矩阵 - Pfam域交互（interactions矩阵）。我想总结每个蛋白质对的已知域 - 域相互作用的总数。

实际上domains列表和interactions矩阵都比这些矩阵大得多，所以我想确定一种生成此结果矩阵的快速方法。但是，到目前为止，我能够提出的唯一解决方案是apply循环：

proteins = names(domains)
result = matrix(0, nrow = length(proteins), ncol = length(proteins),
dimnames = list(proteins, proteins))
combinations = tidyr::crossing(proteins, proteins)
n_interactions = apply(combinations, 1, function(row) {
  domains1 = domains[[row[1]]]
  domains2 = domains[[row[2]]]
  sum(interactions[as.matrix(crossing(domains1, domains2))])
})
result[as.matrix(combinations)] = n_interactions

我确信必须有更快的方法来做到这一点，但是如何？

Answer 1

假设你的矩阵像示例一样排序，你可以聪明地使用一些矩阵代数：

columnBuilder <- function(m,l,n){
  rep.int(c(0,1,0),
          c(l,n,m-n-l))
}


matrixBuilder <- function(domainList){
  groupSizes <- sapply(domains,length)
  leadingZeros <- cumsum(c(0,groupSizes))
  m <- sum(groupSizes)

  sapply(seq_along(groupSizes),
         function(i){
           columnBuilder(m,leadingZeros[[i]],groupSizes[[i]])
         })
}

magicFunction <- function(interactionsM, domainL){
  magicMatrix <- matrixBuilder(domainL)

  output <- t(magicMatrix) %*% interactionsM %*% magicMatrix
  colnames(output) <- rownames(output) <- names(domainL)
  output

}

magicFunction(interactions, domains)

           O60925 P01130 Q14764 Q9BX68 P46777
O60925      1      0      0      1      0
P01130      0      7      0      0      0
Q14764      0      0      2      0      0
Q9BX68      1      0      0      1      0
P46777      0      0      0      0      0

关于这个很酷的事情是 1.您应始终能够对矩阵进行排序以使用此方法 2.这不应该是内存密集型的 3.您可以修改它以仅构建magicMatrix的单个列，如同在较大的情况下相乘，您将最终只得到最终输出中的单个列。您不必运行整个算法来获取您想要查看的列！至于bencmarks：

microbenchmark::microbenchmark(
  OP = {
    proteins = names(domains)
    result = matrix(0, nrow = length(proteins), ncol = length(proteins),
                    dimnames = list(proteins, proteins))
    combinations = tidyr::crossing(proteins, proteins)
    n_interactions = apply(combinations, 1, function(row) {
      domains1 = domains[[row[1]]]
      domains2 = domains[[row[2]]]
      sum(interactions[as.matrix(tidyr::crossing(domains1, domains2))])
    })
    result[as.matrix(combinations)] = n_interactions
  },
  privefl = {
    n <- length(domains)
    res <- matrix(nrow = n, ncol = n)
    res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
                             function(Var1,Var2){sum(interactions[Var1, Var2])}) 
    colnames(res) <- rownames(res) <- names(domains)
  },
  matrixAlgebra = {
    magicFunction(interactions, domains)
  },


  times = 10
)



Unit: microseconds
          expr       min        lq       mean    median        uq        max neval
            OP 18996.486 20218.043 33483.5307 21058.912 22152.479 143394.733    10
       privefl   406.579   424.811   467.1096   448.513   475.861    642.503    10
 matrixAlgebra    72.200    95.902   123.1771   111.946   137.471    261.085    10

Answer 2

你可以这样做：

n <- length(domains)
res <- matrix(nrow = n, ncol = n)
res[] <- purrr::pmap_dbl(expand.grid(domains, domains), 
                         ~ sum(interactions[.x, .y])) 
colnames(res) <- rownames(res) <- names(domains)

事实上，这与你所做的没什么不同。

基准：

microbenchmark::microbenchmark(
  OP = {
    proteins = names(domains)
    result = matrix(0, nrow = length(proteins), ncol = length(proteins),
                    dimnames = list(proteins, proteins))
    combinations = tidyr::crossing(proteins, proteins)
    n_interactions = apply(combinations, 1, function(row) {
      domains1 = domains[[row[1]]]
      domains2 = domains[[row[2]]]
      sum(interactions[as.matrix(crossing(domains1, domains2))])
    })
    result[as.matrix(combinations)] = n_interactions
  },
  privefl = {
    n <- length(domains)
    res <- matrix(nrow = n, ncol = n)
    res[] <- purrr::pmap_dbl(expand.grid(domains, domains), 
                             ~ sum(interactions[.x, .y])) 
    colnames(res) <- rownames(res) <- names(domains)
  },
  times = 10
)

结果：

Unit: microseconds
    expr        min         lq       mean     median         uq       max neval
      OP 208685.225 209913.891 231506.172 210817.264 213071.475 416724.50    10
 privefl    262.885    281.426   1580.779    306.092    396.975  12842.56    10

通过行

2 个答案: