我在 R 服务器上使用光栅包来处理大量数据文件(30000 个文件)(每个 10MB)。
目前,处理包括解析数据并随后通过 rasterize
函数对其进行光栅化。
数据非常稀疏(仅沿道路),但具有高分辨率和大范围。我见过 30GB 的临时文件,用于从输入文件之一创建的栅格。
由于文件数量的原因,我使用 foreach() %dopar%
方法处理文件,为每个线程提供一个文件。我已将栅格选项设置如下:
rasterOptions(maxmemory = 15000000000)
rasterOptions(chunksize = 14000000000)
rasterOptions(todisk = TRUE)
这应该是 15GB/线程 * 32 个线程 = 480GB 的 RAM 最大用于栅格。增加一些开销,我希望 512GB RAM 中的 10GB 到 20GB 之间会保留。然而,事实并非如此,我似乎无法弄清楚为什么。 R 吞噬 RAM,直到只剩下 100MB 到 2GB,然后似乎才释放先前分配的内存,只是为了下一个光栅直接反馈到 R。我在几个小时内反复检查 RAM 使用情况以观察这一点。
我使用 SpatialPolygonDataFrames
作为 rasterize
的输入,并怀疑它们可能也会占用大量 RAM。但是当我检查它们的大小时,它们相当小,大约 100MB。玩弄 maxmemory
、chunksize
和只有 16 个线程似乎也没有任何效果。
我还查看了 rasterize
source code 以查看是否在那里找到了解释,但这并没有让我走得更远:
setMethod('rasterize', signature(x='SpatialPoints', y='Raster'),
function(x, y, field, fun='last', background=NA, mask=FALSE, update=FALSE, updateValue='all', filename="", na.rm=TRUE, ...){
.pointsToRaster(x, y, field=field, fun=fun, background=background, mask=mask, update=update, updateValue=updateValue, filename=filename, na.rm=na.rm, ...)
}
)
我不知道在哪里可以找到 .pointsToRaster
有没有人对这种行为有任何解释或有什么要检查的事情的想法?我只是忽略了什么吗?我想不使用整个 RAM,以便其他用户仍然可以在服务器上工作。据我了解,我的代码应该控制使用了多少 RAM。
这是我使用的代码:
library('iterators')
library('parallel')
library('foreach')
library('doParallel')
#init parallelisation
nCores = 32
cCluster = makeCluster(nCores, type = "FORK", outFile = "parseProcess")
registerDoParallel(cCluster)
foreach(j = 1:length(fileList)) %dopar%{
#load all libraries for every thread
library('sp')
library('raster')
library('spatial')
library('gstat')
library('rgdal')
library('dismo')
library('deldir')
library('rgeos')
library('sjmisc')
#set rasteroptions per thread
rasterOptions(maxmemory = 15000000000)
rasterOptions(chunksize = 14000000000)
rasterOptions(todisk = TRUE)
tmpFolder = paste0("[PATH TO STORAGE]/rtmp",j)
dir.create(tmpFolder)
rasterOptions(tmpdir = tmpFolder)
#generate names for raster files
fileName = basename(fileList[j])
print(paste("Processing:", fileName))
rNameMax0 = sub(pattern = ".bin", replacement = "_scan0_max.tif", fileName)
#repeat this for all 11 scans
rasterStorage = "[PATH TO OTHER STORAGE]" #path to raster folder
scanList = parseFile(fileList[j]) #any memory allocated in this functions should be released on function return
#create template raster
bounds = as.vector(t(bbox(scanList$scan0)))
resolution = c(0.0000566, 0.0000359)
tmp = raster(xmn = bounds[1], xmx = bounds[2], ymn = bounds[3], ymx = bounds[4], res = resolution)
#create rasters from data
coordinates(scanList$scan0) = ~Long+Lat
proj4string(scanList$scan0) = WGS84CRS
rScanMax0 = rasterize(scanList$scan0, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax0))
rm('rScanMax0')
#repeat for scans 1 to 4
removeTmpFiles(h = 0.2)
unlink(tmpFolder, recursive = TRUE, force = TRUE)
dir.create(tmpFolder)
rasterOptions(tmpdir = tmpFolder)
coordinates(scanList$scan5) = ~Long+Lat
proj4string(scanList$scan5) = WGS84CRS
rScanMax5 = rasterize(scanList$scan5, tmp, fun = 'max', filename = paste0(rasterStorage, rNameMax5))
rm('rScanMax5')
#repeat for scans 6 to 10
removeTmpFiles(h = 0.2)
unlink(tmpFolder, recursive = TRUE, force = TRUE)
}
stopCluster(cCluster)
这是 parseFile 函数的(内脏)代码:
parseFile = function(fileName){
con = file(fileName, "rb")
intSize = 4
fileEndian = "little"
#create data frames for each scan
scan0 = data.frame(matrix(ncol = n1, nrow = 0))
colnames(scan0) = c("Lat", "Long", ...)
scan1 = data.frame(matrix(ncol = n2, nrow = 0))
colnames(scan1) = c("Lat", "Long", ...)
scan2 = data.frame(matrix(ncol = n3, nrow = 0))
colnames(scan2) = c("Lat", "Long", ...)
scan3 = data.frame(matrix(ncol = n4, nrow = 0))
colnames(scan3) = c("Lat", "Long", ...)
scan4 = data.frame(matrix(ncol = n5, nrow = 0))
colnames(scan4) = c("Lat", "Long", ...)
scan5 = data.frame(matrix(ncol = n6, nrow = 0))
colnames(scan5) = c("Lat", "Long", ...)
scan6 = data.frame(matrix(ncol = n7, nrow = 0))
colnames(scan6) = c("Lat", "Long", ...)
scan7 = data.frame(matrix(ncol = n8, nrow = 0))
colnames(scan7) = c("Lat", "Long", ...)
scan8 = data.frame(matrix(ncol = n9, nrow = 0))
colnames(scan8) = c("Lat", "Long", ...)
scan9 = data.frame(matrix(ncol = n10, nrow = 0))
colnames(scan9) = c("Lat", "Long", ...)
scan10 = data.frame(matrix(ncol = n11, nrow = 0))
colnames(scan10) = c("Lat", "Long", ...)
header = readBin(con, raw(), n = 36)
i = 1
while(i){
blockHeader = readBin(con, integer(), n = 3, size = intSize, endian = fileEndian)
if(...){ #check whether file ended
break
}
i = i + 1
#sort data to correct scan, assign GPS tag
blockTrailer = readBin(con, raw(), n = 8)
}
#clean up
close(con)
#return parsed data
returnList = list("scan0" = scan0, "scan1" = scan1, "scan2" = scan2, "scan3" = scan3, "scan4" = scan4,
"scan5" = scan5, "scan6" = scan6, "scan7" = scan7, "scan8" = scan8, "scan9" = scan9, "scan10" = scan10)
return(returnList)
}
我也在查看发布于 here 的解决方案作为另一种方法,但我仍然想知道为什么我的代码没有像我期望的那样工作。