我想在R中有一个高效的框架,用于对大整数的三向张量进行文件支持存储(15000(时间维度)x 500000(第二维)x 500(样本,3d维度)) ,使用行/时间维度增强矩阵(即使用15000 * 500000 x 500矩阵),我需要功能来有效地检索此矩阵的特定部分以进行内存处理,以及在处理后更新特定部分。对于密集矩阵,我可以使用bigmatrix
包,但在我的最终应用中,矩阵是稀疏的(大约99%零),据我所知,bigmatrix
目前不支持稀疏矩阵。有没有人知道我可以在R中使用的任何其他选项? (包ff
和dplyr backed with an on-disk database我理解目前也不支持稀疏矩阵或张量)任何想法?
密集张量/增广矩阵情况的示例代码(但是它还需要对1000个更大的稀疏张量/矩阵起作用)
# example problem size
NRows = 15000 # time dimension
NCols = 500 # 2nd dimension, 1000x larger & sparse in final application
NSamples = 20 # sample dimension, 500 in reality, testing with 20 here
# just filling with a constant integer here, in reality data is read in from netcdf file
# in final application data will be 1000x larger & sparse, with 99% zeros
getsamplematrix = function(r=NRows,c=NCols) matrix(1L, nrow=r, ncol=c)
### 1. Using bigmemory as backend
library(bigmemory)
## step 1: store tensor in row/time dimension augmented bigmemory matrix
putdata = function (NRows, NCols, NSamples) {
data = big.matrix(NRows*NSamples, NCols, type = "integer",
backingfile = "data.bin", descriptorfile = "data.desc",
backingpath = getwd() )
for (i in 1:NSamples) {
data[(1+(i-1)*NRows):(i*NRows), 1:NCols] = getsamplematrix(r = NRows, c = NCols)
}
attr(data, "NRows") = NRows
return(data)
}
system.time(data <- putdata(NRows,NCols,NSamples)) # 23.28 s for 20 matrices
## step 2: get subset of time slices from all samples and store this in 3-way tensor/array S (for in-memory processing)
getsubtensor = function(data, timeindices, cols, samples) {
S = array(dim=c(length(timeindices),length(cols),length(samples))) # preallocate array
nrows = attr(data,"NRows")
for (i in samples) {
S[timeindices,cols,i] = data[((1+(i-1)*nrows):(i*nrows))[timeindices],cols]
}
return(S) }
# example: get time indices 1:100 from all samples
system.time(S <- getsubtensor(data, 1:100, 1:NCols, 1:NSamples)) # 0.04 s
dim(S)
## step 3: update subtensor S at given positions in original disk-mapped data after some processing
updatesubtensor = function(data, S, timeindices, cols, samples) {
nrows = attr(data,"NRows")
for (i in samples) {
data[((1+(i-1)*nrows):(i*nrows))[timeindices],cols] = S[timeindices,cols,i]
}
return(data) }
S2 <- S*2L # example, processing would be done here
system.time(data <- updatesubtensor(data, S2, 1:100, 1:NCols, 1:NSamples)) # 0.17s