加速R中的矩阵行和列操作

时间:2016-12-28 01:46:29

标签: r performance matrix parallel-processing

我有一个由正值构成的大矩阵:

# Reproducible example: a 130000 x 1000 matrix of positive values,
# rows named r1 .. r130000.
set.seed(1)
mat <- matrix(abs(rnorm(130000 * 1000)), nrow = 130000, ncol = 1000)
rownames(mat) <- paste0("r", seq_len(nrow(mat)))

mat 的 rownames 通过下面的数据框与 parent.id 相关联:
row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(13000,130000,replace=T)),sep=""))

这样每隔几行就会与同一个parent.id相关联。

我需要为 mat 中的每一行计算以下几种统计量:

  1. 行元素取 log 后的均值(mean)

  2. 该行占所有具有相同 parent.id 的行之和的比例的均值(mean)

  3. 上述比例经 probit 变换后的均值(mean)

  4. 上述比例经 probit 变换后的标准差(sd)

很自然,下面是首先想到的解决方案:

    # Per-row statistics over parent groups.  Fixes/improvements over the
    # naive version:
    #  * `parallel` is loaded explicitly — mclapply() lives there, not in VGAM;
    #  * each row's parent.id is aligned once via match() instead of three
    #    nested which()/%in% scans on every iteration;
    #  * the per-parent column totals are computed once per row with colSums()
    #    instead of three identical apply(..., 2, sum) passes.
    library(VGAM)      # probit()
    library(parallel)  # mclapply()

    # parent.of[i] is the parent.id of mat row i (aligned through rownames).
    parent.of <- row.ids.df$parent.id[match(rownames(mat), row.ids.df$row.id)]

    res.df <- do.call(rbind, mclapply(seq_len(nrow(mat)), function(x) {
      # All mat rows sharing this row's parent (self included).
      idx <- which(parent.of == parent.of[x])
      # Column-wise totals over the parent group, computed once per row.
      parent.sums <- colSums(mat[idx, , drop = FALSE])
      prop <- mat[x, ] / parent.sums
      data.frame(mean.log        = mean(log(mat[x, ])),
                 mean.proportion = mean(prop),
                 mean.probit     = mean(probit(prop)),
                 sd.probit       = sd(probit(prop)))
    }))
    

    但我想知道是否有任何方法可以更快地实现这一目标。

    P.S。

    我不认为使用data.table代替矩阵是可行的方法:

    # Benchmark: the same per-row computation, but with `mat` stored as a
    # data.table instead of a matrix.
    # NOTE(review): data.table does not genuinely support rownames — the
    # `rownames<-` call below only attaches an attribute; confirm the intended
    # lookup behavior.  Each `mat[x, ]` returns a one-row data.table rather
    # than a numeric vector, hence the repeated as.numeric() coercions.
    require(data.table)
    require(microbenchmark)
    require(VGAM)
    
    # Small 13 x 5 example so the benchmark itself runs quickly.
    set.seed(1)
    mat <- data.table(matrix(abs(rnorm(13*5)),nrow=13,ncol=5))
    rownames(mat) <- paste("r",1:nrow(mat),sep="")
    row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
    
    microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
      # All rows sharing this row's parent.id (self included).
      idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
      # as.numeric() is needed at every use because data.table row subsetting
      # does not yield a plain numeric vector.
      data.frame(mean.log=mean(as.numeric(log(mat[x,]))),
                 mean.proportion=mean(as.numeric(mat[x,])/apply(mat[idx,],2,sum)),
                 mean.probit=mean(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))),
                 sd.probit=sd(probit(as.numeric(mat[x,])/apply(mat[idx,],2,sum))))
    })))
    
    
    expr
     df <- do.call(rbind, lapply(1:nrow(mat), function(x) {     idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==          row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])     data.frame(mean.log = mean(as.numeric(log(mat[x, ]))), mean.proportion = mean(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(as.numeric(mat[x,          ])/apply(mat[idx, ], 2, sum)))) }))
          min       lq     mean   median       uq     max neval
     65.08929 66.49415 69.78937 67.70534 70.38044 206.017   100
    >
    

与下面这个使用普通矩阵的版本相比:

    # The same benchmark with `mat` as a plain matrix: `mat[x, ]` returns a
    # numeric vector directly, so no as.numeric() coercion is needed.  The
    # code is kept otherwise identical to the data.table variant so the two
    # timings are directly comparable.
    set.seed(1)
    mat <- matrix(abs(rnorm(13*5)),nrow=13,ncol=5)
    rownames(mat) <- paste("r",1:nrow(mat),sep="")
    row.ids.df <- data.frame(row.id=rownames(mat),parent.id=paste("p",sort(sample(2,13,replace=T)),sep=""))
    
    require(VGAM)
    microbenchmark(df <- do.call(rbind,lapply(1:nrow(mat), function(x) {
      # All rows sharing this row's parent.id (self included).
      idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id == row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])
      data.frame(mean.log=mean(log(mat[x,])),
                 mean.proportion=mean(mat[x,]/apply(mat[idx,],2,sum)),
                 mean.probit=mean(probit(mat[x,]/apply(mat[idx,],2,sum))),
                 sd.probit=sd(probit(mat[x,]/apply(mat[idx,],2,sum))))
    })))
    
    
    Unit: milliseconds
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                expr
     df <- do.call(rbind, lapply(1:nrow(mat), function(x) {     idx <- which(rownames(mat) %in% row.ids.df$row.id[which(row.ids.df$parent.id ==          row.ids.df$parent.id[which(row.ids.df$row.id == rownames(mat)[x])])])     data.frame(mean.log = mean(log(mat[x, ])), mean.proportion = mean(mat[x,          ]/apply(mat[idx, ], 2, sum)), mean.probit = mean(probit(mat[x,          ]/apply(mat[idx, ], 2, sum))), sd.probit = sd(probit(mat[x,          ]/apply(mat[idx, ], 2, sum)))) }))
          min      lq     mean median       uq      max neval
     10.15047 10.2894 10.69573 10.428 10.69741 14.56724   100
    

除非每次想对行做运算时都先用 as.numeric() 转换,否则 data.table 行不通——而每次都这样转换显然是个坏主意。

1 个答案:

答案 0 :(得分:1)

  

我不认为使用data.table而不是矩阵是要走的路

显然,你必须实际使用data.table。它不是一个神奇的魔杖,可以在不花费精力的情况下优化代码。您需要使用data.table语法。

  

我需要为mat中的每一行计算这些操作:

mean of log of the row elements

mean proportion of of that row out of all rows with the same parent.id

mean probit of the proportion of of that row out of all rows with the same parent.id

sd probit of the proportion of of that row out of all rows with the same parent.id

我认为这可能会满足您的需求:

library(data.table)

# Wide -> long: one row per (row.id, parent.id, original-column) cell value.
DT <- data.table(row.ids.df, mat)
DT <- melt(DT, id.vars = c("row.id", "parent.id"))

# Each cell's share of its parent group's total, within each original column.
DT[, proportion := value / sum(value), by = list(variable, parent.id)]

# Collapse back to one row per original matrix row with the four statistics.
res <- DT[, list(
  mean.log        = mean(log(value)),
  mean.proportion = mean(proportion),
  mean.probit     = mean(probit(proportion)),
  sd.probit       = sd(probit(proportion))),
  by = row.id]

# Sanity check: the data.table result matches the original per-row loop's
# output (numeric tolerance via all.equal).
all.equal(res[["sd.probit"]], 
          res.df[["sd.probit"]])
#[1] TRUE
#(Tested with 100 rows and 30 columns.)

我预计它会更高效;无论如何,它的可读性也更好。