栅格化(rasterize)时 raster 包占用大量内存

时间:2021-01-04 11:42:50

标签: r memory raster

我在一台 R 服务器上使用 raster 包来处理大量数据文件(30000 个文件,每个约 10MB)。

目前,处理过程包括解析数据,然后通过 rasterize 函数将其栅格化。数据非常稀疏(仅沿道路分布),但分辨率高、覆盖范围大。我见过由其中一个输入文件生成的栅格,其临时文件达到 30GB。

由于文件数量的原因,我使用 foreach() %dopar% 方法处理文件,为每个线程提供一个文件。我已将栅格选项设置如下:

# Configure the raster package in a single call: memory ceiling and chunk
# size are given in bytes (15e9 and 14e9), and results are forced to disk.
rasterOptions(
    maxmemory = 15e9,
    chunksize = 14e9,
    todisk = TRUE
)

这样算下来,栅格最多应占用 15GB/线程 * 32 个线程 = 480GB 的 RAM。再算上一些开销,我预期 512GB RAM 中应该还有 10GB 到 20GB 保持空闲。然而,事实并非如此,我想不出原因。R 会不断吞噬 RAM,直到只剩下 100MB 到 2GB,然后才似乎释放先前分配的内存,而这些内存又立刻被下一个栅格占用。我在几个小时内反复检查 RAM 使用情况,才观察到这一点。

我使用 SpatialPolygonDataFrames 作为 rasterize 的输入,并怀疑它们可能也会占用大量 RAM。但是当我检查它们的大小时,它们相当小,大约只有 100MB。调整 maxmemory、chunksize 以及只使用 16 个线程似乎也没有任何效果。我还查看了 rasterize 的源代码,想看看能否在其中找到解释,但这并没有让我更进一步:

# S4 method for rasterize() dispatched on (SpatialPoints, Raster).
# It performs no work of its own: every argument, including anything
# supplied through `...`, is forwarded unchanged to .pointsToRaster().
setMethod(
    'rasterize',
    signature(x='SpatialPoints', y='Raster'),
    function(x, y, field, fun='last', background=NA, mask=FALSE, update=FALSE, updateValue='all', filename="", na.rm=TRUE, ...) {
        .pointsToRaster(
            x,
            y,
            field=field,
            fun=fun,
            background=background,
            mask=mask,
            update=update,
            updateValue=updateValue,
            filename=filename,
            na.rm=na.rm,
            ...
        )
    }
)

我不知道在哪里可以找到 .pointsToRaster

有没有人能解释这种行为,或者对应该检查哪些地方有什么想法?我是不是忽略了什么?我不想占满整个 RAM,这样其他用户仍然可以在服务器上工作。据我理解,我的代码应该能够控制使用多少 RAM。

这是我使用的代码:

library('iterators')
library('parallel')
library('foreach')
library('doParallel')

# Initialise the parallel backend: one forked worker per core.
nCores <- 32
# The option is spelled `outfile` (lower case). The original `outFile` is
# not a recognised cluster option, so worker output was not redirected to
# "parseProcess" and the print() calls below were lost.
cCluster <- makeCluster(nCores, type = "FORK", outfile = "parseProcess")
registerDoParallel(cCluster)

# Process one input file per task. The cluster is stopped in `finally` so
# the workers are released even if an iteration raises an error.
tryCatch({
    # seq_along() yields an empty sequence for an empty fileList, whereas
    # 1:length(fileList) would iterate over c(1, 0).
    foreach(j = seq_along(fileList)) %dopar% {
        # load all libraries for every worker
        library('sp')
        library('raster')
        library('spatial')
        library('gstat')
        library('rgdal')
        library('dismo')
        library('deldir')
        library('rgeos')
        library('sjmisc')

        # Set raster options per worker; each task gets its own temp folder
        # so that it can be wiped without touching other workers' files.
        tmpFolder <- paste0("[PATH TO STORAGE]/rtmp", j)
        # showWarnings = FALSE: a folder left over from an aborted run is fine.
        dir.create(tmpFolder, showWarnings = FALSE)
        rasterOptions(maxmemory = 15000000000)
        rasterOptions(chunksize = 14000000000)
        rasterOptions(todisk = TRUE)
        rasterOptions(tmpdir = tmpFolder)

        # generate names for raster files
        fileName <- basename(fileList[j])
        print(paste("Processing:", fileName))
        # Escape the dot and anchor at the end so that only a trailing
        # ".bin" extension is replaced (".bin" as a regex matches any
        # character followed by "bin", anywhere in the name).
        rNameMax0 <- sub(pattern = "\\.bin$", replacement = "_scan0_max.tif", fileName)
        #repeat this for all 11 scans

        rasterStorage <- "[PATH TO OTHER STORAGE]" #path to raster folder

        # any memory allocated inside parseFile should be released on return
        scanList <- parseFile(fileList[j])

        # create template raster from the extent of scan0
        bounds <- as.vector(t(bbox(scanList$scan0)))
        resolution <- c(0.0000566, 0.0000359)
        tmp <- raster(xmn = bounds[1], xmx = bounds[2], ymn = bounds[3], ymx = bounds[4], res = resolution)

        # create rasters from data

        coordinates(scanList$scan0) <- ~Long+Lat
        proj4string(scanList$scan0) <- WGS84CRS
        rScanMax0 <- rasterize(scanList$scan0, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax0))
        # rm() only removes the binding; gc() asks R to collect the now
        # unreferenced objects before the next raster is built.
        rm('rScanMax0')
        gc()

        #repeat for scans 1 to 4

        # Drop temp files halfway through and start with a fresh folder.
        removeTmpFiles(h = 0.2)
        unlink(tmpFolder, recursive = TRUE, force = TRUE)
        dir.create(tmpFolder, showWarnings = FALSE)
        rasterOptions(tmpdir = tmpFolder)

        coordinates(scanList$scan5) <- ~Long+Lat
        proj4string(scanList$scan5) <- WGS84CRS
        rScanMax5 <- rasterize(scanList$scan5, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax5))
        rm('rScanMax5')
        gc()

        #repeat for scans 6 to 10

        removeTmpFiles(h = 0.2)
        unlink(tmpFolder, recursive = TRUE, force = TRUE)
    }
}, finally = {
    stopCluster(cCluster)
})

这是 parseFile 函数的核心代码(已精简):

# Parse one binary input file into per-scan data frames.
#
# Args:
#   fileName: path of the file to read; it is opened in binary ("rb") mode.
#
# Returns:
#   A named list with the elements "scan0" ... "scan10", one data frame
#   per scan.
parseFile <- function(fileName) {
    con <- file(fileName, "rb")
    # Close the connection on every exit path. Previously an error raised
    # while parsing would skip the trailing close() and leak the connection.
    on.exit(close(con), add = TRUE)

    intSize <- 4
    fileEndian <- "little"

    # create an empty data frame for each scan
    scan0 <- data.frame(matrix(ncol = n1, nrow = 0))
    colnames(scan0) <- c("Lat", "Long", ...)
    scan1 <- data.frame(matrix(ncol = n2, nrow = 0))
    colnames(scan1) <- c("Lat", "Long", ...)
    scan2 <- data.frame(matrix(ncol = n3, nrow = 0))
    colnames(scan2) <- c("Lat", "Long", ...)
    scan3 <- data.frame(matrix(ncol = n4, nrow = 0))
    colnames(scan3) <- c("Lat", "Long", ...)
    scan4 <- data.frame(matrix(ncol = n5, nrow = 0))
    colnames(scan4) <- c("Lat", "Long", ...)
    scan5 <- data.frame(matrix(ncol = n6, nrow = 0))
    colnames(scan5) <- c("Lat", "Long", ...)
    scan6 <- data.frame(matrix(ncol = n7, nrow = 0))
    colnames(scan6) <- c("Lat", "Long", ...)
    scan7 <- data.frame(matrix(ncol = n8, nrow = 0))
    colnames(scan7) <- c("Lat", "Long", ...)
    scan8 <- data.frame(matrix(ncol = n9, nrow = 0))
    colnames(scan8) <- c("Lat", "Long", ...)
    scan9 <- data.frame(matrix(ncol = n10, nrow = 0))
    colnames(scan9) <- c("Lat", "Long", ...)
    scan10 <- data.frame(matrix(ncol = n11, nrow = 0))
    colnames(scan10) <- c("Lat", "Long", ...)

    # the file starts with a 36-byte header
    header <- readBin(con, raw(), n = 36)

    # i counts the blocks read; the loop only ends through the break below
    i <- 1

    while (i) {
        # each block starts with three little-endian 4-byte integers
        blockHeader <- readBin(con, integer(), n = 3, size = intSize, endian = fileEndian)
        if (...) { #check whether file ended
            break
        }
        i <- i + 1

        #sort data to correct scan, assign GPS tag

        # each block ends with an 8-byte trailer
        blockTrailer <- readBin(con, raw(), n = 8)
    }

    # return parsed data; the connection is closed by on.exit() above
    list("scan0" = scan0, "scan1" = scan1, "scan2" = scan2, "scan3" = scan3, "scan4" = scan4,
         "scan5" = scan5, "scan6" = scan6, "scan7" = scan7, "scan8" = scan8, "scan9" = scan9, "scan10" = scan10)
}

我也在查看发布于 here 的解决方案作为另一种方法,但我仍然想知道为什么我的代码没有像我期望的那样工作。

0 个答案:

没有答案