Question

在此CodeReview上的post中，我比较了生成大稀疏矩阵的几种方法。具体来说，我使用Matrix中的R包比较了密集结构和稀疏结构。我的问题是关于稀疏结构的后处理。我发现，当我尝试查找每k列的行总和时，密集结构的性能优于稀疏结构。

微基准测试

ncols <- 100000
nrows <- 1000
col_probs <- runif(ncols, 0.001, 0.002)

mat1 <- spMat_dense(ncols=ncols,nrows=nrows,col_probs=col_probs)
mat2 <- spMat_dgC(ncols=ncols,nrows=nrows,col_probs = col_probs)
mat3 <- spMat_dgT(ncols=ncols,nrows=nrows,col_probs=col_probs)

k <- 50
starts <- seq(1, ncols, by=k)
microbenchmark::microbenchmark(sapply(starts, function(x) rowSums(mat1[, x:(x+k-1)])),
                               sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
                               sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x+k-1)])),
                               times=5L)

Unit: milliseconds
                                                                              expr
         sapply(starts, function(x) rowSums(mat1[, x:(x + k -      1)]))
 sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k -      1)]))
 sapply(starts, function(x) Matrix::rowSums(mat3[, x:(x + k -      1)]))
        min         lq      mean     median        uq       max
   912.0453   947.0454  1041.365   965.4375  1007.311  1374.988
  2097.4125  2208.0056  2566.575  2406.8450  2851.640  3268.970
 13231.4790 13619.3818 13819.745 13675.6282 13923.803 14648.434
 neval cld
     5 a  
     5  b 
     5   c

我的猜测是sapply函数在稠密矩阵上效果更好，因为它不需要在后台进行稀疏到稠密的转换。这些功能在下面发布。

问题有没有一种方法可以提高上述稀疏结构的后处理速度？

功能

spMat_dense <- function(ncols,nrows,col_probs){
  matrix(rbinom(nrows*ncols,1,col_probs),
         ncol=ncols,byrow=T)
}

library(Matrix)
spMat_dgC <- function(ncols,nrows,col_probs){
  #Credit to Andrew Guster (https://stackoverflow.com/a/56348978/4321711)
  mat <- Matrix(0, nrows, ncols, sparse = TRUE)  #blank matrix for template
  i <- vector(mode = "list", length = ncols)     #each element of i contains the '1' rows
  p <- rep(0, ncols)                             #p will be cumsum no of 1s by column
  for(r in 1:nrows){
    row <- rbinom(ncols, 1, col_probs)            #random row
    p <- p + row                                 #add to column identifier
    if(any(row == 1)){
      for (j in which(row == 1)){
        i[[j]] <- c(i[[j]], r-1)                 #append row identifier
      }
    }
  }
  p <- c(0, cumsum(p))                           #this is the format required
  i <- unlist(i)
  x <- rep(1, length(i))
  mat@i <- as.integer(i)
  mat@p <- as.integer(p)
  mat@x <- x
  return(mat)
}

spMat_dgT <- function(ncols, nrows, col_probs){
  #Credit to minem - https://codereview.stackexchange.com/a/222190/121860
  r <- lapply(1:ncols, function(x) {
    p <- col_probs[x]
    i <- sample.int(2L, size = nrows, replace = T, prob = c(1 - p, p))
    which(i == 2L)
  })
  rl <- lengths(r)
  nc <- rep(1:ncols, times = rl) # col indexes
  nr <- unlist(r) # row index
  ddims <- c(nrows, ncols)
  sparseMatrix(i = nr, j = nc, dims = ddims, giveCsparse = FALSE)
}

Answer 1

使用dgCMatrix作为输入，这是一种非常快速的可能解决方案：

new_combine <- function(mat,k){
  #Convert dgCMatrix to dgTMatrix
  x.T <- as(mat, "dgTMatrix") 
  #Map column indices to new set of indices 
  #based on partitioning every k columns
  x.T@j <- as.integer(x.T@j %/% k)
  #Correct dimensions of new matrix
  x.T@Dim <- as.integer(c(nrow(x.T),floor(ncol(mat)/k)))
  #Convert back to dgCMatrix
  y <- as(x.T,"dgCMatrix")
  y
}

microbenchmark::microbenchmark(sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)])),
                               new_combine(mat2,k),
                               times=5L)

Unit: milliseconds
                                                                    expr
 sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x + k -      1)]))
                                                            new_combine(mat2, k)
         min          lq       mean     median         uq
 1808.872676 1864.783181 1925.17118 1935.98946 1990.28866
    8.471521    9.396441   10.99871   10.04459   10.96175
        max neval cld
 2025.92192     5   b
   16.11923     5  a


comp <- sapply(starts, function(x) Matrix::rowSums(mat2[, x:(x+k-1)]))
comp2 <- new_combine(mat2,k)

> all.equal(comp2,as(comp,"dgCMatrix"))
[1] TRUE

对大型稀疏矩阵的每一行中的每k列求和的有效方法

1 个答案: