我有两种输入,格式如下:
domains = list(
O60925 = "PF01920",
P01130 = c("PF07645", "PF00057", "PF00058"),
Q14764 = c("PF11978", "PF01505"),
Q9BX68 = "PF01230",
P46777 = "PF14204")
interactions = structure(c(1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0), .Dim = c(8L, 8L), .Dimnames = list(c("PF01920",
"PF07645", "PF00057", "PF00058", "PF11978", "PF01505", "PF01230",
"PF14204"), c("PF01920", "PF07645", "PF00057", "PF00058", "PF11978",
"PF01505", "PF01230", "PF14204")))
PF01920 PF07645 PF00057 PF00058 PF11978 PF01505 PF01230 PF14204
PF01920 1 0 0 0 0 0 1 0
PF07645 0 1 0 1 0 0 0 0
PF00057 0 0 1 1 0 0 0 0
PF00058 0 1 1 1 0 0 0 0
PF11978 0 0 0 0 1 0 0 0
PF01505 0 0 0 0 0 1 0 0
PF01230 1 0 0 0 0 0 1 0
PF14204 0 0 0 0 0 0 0 0
我想计算以下输出,其中每个单元格中的整数表示interactions
列表中每对名称的domains
矩阵中所有单元格的总和。
O60925 P01130 Q14764 Q9BX68 P46777
O60925 1 0 0 1 0
P01130 0 7 0 0 0
Q14764 0 0 2 0 0
Q9BX68 1 0 0 1 0
P46777 0 0 0 0 0
上下文是我有一个蛋白质列表(domains
列表的名称)及其Pfam域名(domains
列表中的条目),以及已知Pfam域名的矩阵 - Pfam域交互(interactions
矩阵)。我想总结每个蛋白质对的已知域 - 域相互作用的总数。
实际上domains
列表和interactions
矩阵都比这些矩阵大得多,所以我想确定一种生成此结果矩阵的快速方法。但是,到目前为止,我能够提出的唯一解决方案是apply
循环:
proteins = names(domains)
result = matrix(0, nrow = length(proteins), ncol = length(proteins),
dimnames = list(proteins, proteins))
combinations = tidyr::crossing(proteins, proteins)
n_interactions = apply(combinations, 1, function(row) {
domains1 = domains[[row[1]]]
domains2 = domains[[row[2]]]
sum(interactions[as.matrix(crossing(domains1, domains2))])
})
result[as.matrix(combinations)] = n_interactions
我确信必须有更快的方法来做到这一点,但是如何?
答案 0 :(得分:2)
假设你的矩阵像示例一样排序,你可以聪明地使用一些矩阵代数:
columnBuilder <- function(m,l,n){
rep.int(c(0,1,0),
c(l,n,m-n-l))
}
matrixBuilder <- function(domainList){
groupSizes <- sapply(domains,length)
leadingZeros <- cumsum(c(0,groupSizes))
m <- sum(groupSizes)
sapply(seq_along(groupSizes),
function(i){
columnBuilder(m,leadingZeros[[i]],groupSizes[[i]])
})
}
magicFunction <- function(interactionsM, domainL){
magicMatrix <- matrixBuilder(domainL)
output <- t(magicMatrix) %*% interactionsM %*% magicMatrix
colnames(output) <- rownames(output) <- names(domainL)
output
}
magicFunction(interactions, domains)
O60925 P01130 Q14764 Q9BX68 P46777
O60925 1 0 0 1 0
P01130 0 7 0 0 0
Q14764 0 0 2 0 0
Q9BX68 1 0 0 1 0
P46777 0 0 0 0 0
关于这个很酷的事情是
1.您应始终能够对矩阵进行排序以使用此方法
2.这不应该是内存密集型的
3.您可以修改它以仅构建magicMatrix
的单个列,如同在较大的情况下相乘,您将最终只得到最终输出中的单个列。您不必运行整个算法来获取您想要查看的列!至于bencmarks:
microbenchmark::microbenchmark(
OP = {
proteins = names(domains)
result = matrix(0, nrow = length(proteins), ncol = length(proteins),
dimnames = list(proteins, proteins))
combinations = tidyr::crossing(proteins, proteins)
n_interactions = apply(combinations, 1, function(row) {
domains1 = domains[[row[1]]]
domains2 = domains[[row[2]]]
sum(interactions[as.matrix(tidyr::crossing(domains1, domains2))])
})
result[as.matrix(combinations)] = n_interactions
},
privefl = {
n <- length(domains)
res <- matrix(nrow = n, ncol = n)
res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
function(Var1,Var2){sum(interactions[Var1, Var2])})
colnames(res) <- rownames(res) <- names(domains)
},
matrixAlgebra = {
magicFunction(interactions, domains)
},
times = 10
)
Unit: microseconds
expr min lq mean median uq max neval
OP 18996.486 20218.043 33483.5307 21058.912 22152.479 143394.733 10
privefl 406.579 424.811 467.1096 448.513 475.861 642.503 10
matrixAlgebra 72.200 95.902 123.1771 111.946 137.471 261.085 10
答案 1 :(得分:2)
你可以这样做:
n <- length(domains)
res <- matrix(nrow = n, ncol = n)
res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
~ sum(interactions[.x, .y]))
colnames(res) <- rownames(res) <- names(domains)
事实上,这与你所做的没什么不同。
基准:
microbenchmark::microbenchmark(
OP = {
proteins = names(domains)
result = matrix(0, nrow = length(proteins), ncol = length(proteins),
dimnames = list(proteins, proteins))
combinations = tidyr::crossing(proteins, proteins)
n_interactions = apply(combinations, 1, function(row) {
domains1 = domains[[row[1]]]
domains2 = domains[[row[2]]]
sum(interactions[as.matrix(crossing(domains1, domains2))])
})
result[as.matrix(combinations)] = n_interactions
},
privefl = {
n <- length(domains)
res <- matrix(nrow = n, ncol = n)
res[] <- purrr::pmap_dbl(expand.grid(domains, domains),
~ sum(interactions[.x, .y]))
colnames(res) <- rownames(res) <- names(domains)
},
times = 10
)
结果:
Unit: microseconds
expr min lq mean median uq max neval
OP 208685.225 209913.891 231506.172 210817.264 213071.475 416724.50 10
privefl 262.885 281.426 1580.779 306.092 396.975 12842.56 10