在此CodeReview上的post中,我比较了生成大稀疏矩阵的几种方法。具体来说,我使用Matrix
中的R
包比较了密集结构和稀疏结构。我的问题是关于稀疏结构的后处理。我发现,当我尝试查找每k
列的行总和时,密集结构的性能优于稀疏结构。
微基准测试
ncols <- 100000
nrows <- 1000
col_probs <- runif(ncols, 0.001, 0.002)
mat1 <- spMat_dense(ncols=ncols,nrows=nrows,col_probs=col_probs)
mat2 <- spMat_dgC(ncols=ncols,nrows=nrows,col_probs = col_probs)
mat3 <- spMat_dgT(ncols=ncols,nrows=nrows,col_probs=col_probs)
k <- 50
starts <- seq(1, ncols, by=k)
microbenchmark::microbenchmark(sapply(starts, function(x) rowSums(mat1[, x:(x+k-1)])),
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x+k-1)])),
times=5L)
Unit: milliseconds
expr
sapply(starts, function(x) rowSums(mat1[, x:(x + k - 1)]))
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)]))
sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x + k - 1)]))
min lq mean median uq max
912.0453 947.0454 1041.365 965.4375 1007.311 1374.988
2097.4125 2208.0056 2566.575 2406.8450 2851.640 3268.970
13231.4790 13619.3818 13819.745 13675.6282 13923.803 14648.434
neval cld
5 a
5 b
5 c
我的猜测是sapply
函数在稠密矩阵上效果更好,因为它不需要在后台进行稀疏到稠密的转换。这些功能在下面发布。
问题 有没有一种方法可以提高上述稀疏结构的后处理速度?
功能
spMat_dense <- function(ncols,nrows,col_probs){
matrix(rbinom(nrows*ncols,1,col_probs),
ncol=ncols,byrow=T)
}
library(Matrix)
spMat_dgC <- function(ncols,nrows,col_probs){
#Credit to Andrew Guster (https://stackoverflow.com/a/56348978/4321711)
mat <- Matrix(0, nrows, ncols, sparse = TRUE) #blank matrix for template
i <- vector(mode = "list", length = ncols) #each element of i contains the '1' rows
p <- rep(0, ncols) #p will be cumsum no of 1s by column
for(r in 1:nrows){
row <- rbinom(ncols, 1, col_probs) #random row
p <- p + row #add to column identifier
if(any(row == 1)){
for (j in which(row == 1)){
i[[j]] <- c(i[[j]], r-1) #append row identifier
}
}
}
p <- c(0, cumsum(p)) #this is the format required
i <- unlist(i)
x <- rep(1, length(i))
mat@i <- as.integer(i)
mat@p <- as.integer(p)
mat@x <- x
return(mat)
}
spMat_dgT <- function(ncols, nrows, col_probs){
#Credit to minem - https://codereview.stackexchange.com/a/222190/121860
r <- lapply(1:ncols, function(x) {
p <- col_probs[x]
i <- sample.int(2L, size = nrows, replace = T, prob = c(1 - p, p))
which(i == 2L)
})
rl <- lengths(r)
nc <- rep(1:ncols, times = rl) # col indexes
nr <- unlist(r) # row index
ddims <- c(nrows, ncols)
sparseMatrix(i = nr, j = nc, dims = ddims, giveCsparse = FALSE)
}
答案 0 :(得分:0)
使用dgCMatrix
作为输入,这是一种非常快速的可能解决方案:
new_combine <- function(mat,k){
#Convert dgCMatrix to dgTMatrix
x.T <- as(mat, "dgTMatrix")
#Map column indices to new set of indices
#based on partitioning every k columns
x.T@j <- as.integer(x.T@j %/% k)
#Correct dimensions of new matrix
x.T@Dim <- as.integer(c(nrow(x.T),floor(ncol(mat)/k)))
#Convert back to dgCMatrix
y <- as(x.T,"dgCMatrix")
y
}
microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
new_combine(mat2,k),
times=5L)
Unit: milliseconds
expr
sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k - 1)]))
new_combine(mat2, k)
min lq mean median uq
1808.872676 1864.783181 1925.17118 1935.98946 1990.28866
8.471521 9.396441 10.99871 10.04459 10.96175
max neval cld
2025.92192 5 b
16.11923 5 a
comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)
> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE