Safe way to write a CSV from inside a %dopar% foreach loop

Time: 2020-08-24 17:08:56

Tags: r foreach doparallel

This is a general question: I have seen posts saying that using write.csv inside a foreach loop is not a good idea, because different cores try to write to the file at the same time and results end up missing. Nevertheless, I need to write my output (500,000+ rows and 10+ columns) to an external file from inside the parallel loop, otherwise the job crashes with memory problems. So I would like to know whether there is a safer way to write the results to a file from within a foreach loop. Any help is appreciated.
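
For context, the pattern those posts warn against looks roughly like this minimal sketch (not my actual code; the file name out.csv and the toy computation are placeholders): every worker appends to the same file with no coordination, so two workers can hit the file at the same moment and rows get interleaved or lost.

library(foreach)
library(doParallel)

registerDoParallel(4)

# Unsafe pattern: each worker appends to the same CSV with no locking,
# so concurrent writes can interleave, truncate, or drop rows.
foreach(i = 1:1000) %dopar% {
  res <- data.frame(id = i, value = sqrt(i))
  write.table(res, file = "out.csv", append = TRUE, sep = ",",
              col.names = FALSE, row.names = FALSE)
}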

I am adding more information than is strictly needed here, and the code and data below are much simpler than my real ones.

Explanation: I have two different polygon layers (sf, polygons), each with 500,000+ features. For every polygon I need to calculate the area covered by each raster class (one raster layer with three classes). This is the most time-consuming part of the script, mainly because I have to call sf::st_intersection many times. I then use many different combinations of if-else rules to fill a data.frame with the resulting values.

Here is the original code; with the real data it runs into memory problems:

require(sf)
require(raster)
require(rgdal)
require(rgeos)
require(dplyr)
require(stars)


## Sample data
set.seed(131)

sample_raster = raster(nrows = 1, ncols = 1, res = 0.5, xmn = 0, xmx = 11, ymn = 0, ymx = 11)
values(sample_raster) = rep(1:3, length.out = ncell(sample_raster))
crs(sample_raster) = CRS('+init=EPSG:4326')
plot(sample_raster, axes=T)
sample_raster

##

m = rbind(c(0,0), c(1,0), c(1,1), c(0,1), c(0,0))
p = st_polygon(list(m))
n = 100
l = vector("list", n)
for (i in 1:n)
  l[[i]] = p + 10 * runif(2)
sample_poly = st_sfc(l)


data = data.frame(PR_ID = seq(1:100),
       COND1 = rep(1:10, length.out = 100))
sample_poly = st_sf(cbind(data, sample_poly))


plot(sample_poly, col = sf.colors(categorical = TRUE, alpha = .5), add=T)
sample_poly = sample_poly %>% st_set_crs(4326)
sample_poly

## 





## Code
require(parallel)
require(foreach)  
require(doParallel) 

idall = as.character(sample_poly$PR_ID)
area = as.numeric(st_area(sample_poly))/10000


# i=1
# listID = idall
# mainpoly = sample_poly
# mainras = sample_raster
# mainpolyarea = area


  
per.imovel.paralallel = function (listID, mainpoly, mainras, mainpolyarea) { # Starting the function
  
  ## Setting up the parallel workers on this computer
  UseCores = detectCores()-1 
  cl  = parallel::makeCluster(UseCores, outfile="") # outfile="" sends worker output to the master console
  doParallel::registerDoParallel(cl)
  

  writeLines(c(""), "log.txt") # Creates a LOG FILE in the folder to follow processing 
  
  
  FOREACH.RESULT = foreach(i = 1:length(listID), .packages=c('raster', 'rgdal', 'rgeos', 'dplyr', 'parallel', 
                                                       'doParallel', 'sf', 'stars'), .inorder = T , .combine ='rbind') %dopar% 
  { # Stating the paral-loop
      
      
      sink("log.txt", append=TRUE)  # LOG FILE in the home folder
      cat(paste(i, "of", length(listID), as.character(Sys.time()),"\n")) # Write to LOG FILE
      sink() # end diversion of output
      
      
      ########################
      ### Pick one poly
      px = sf::st_buffer(mainpoly[mainpoly$PR_ID == listID[i],], # Conditional to select the geometry PR_ID in position i
                              dist = 0.1)                             # buffer = 0 w/ byid, selects the geometry 
      
      
      ########################
      ### Intersect with raster and get area
      px2 = sf::st_buffer(px, dist = 0.1) # Buffer because raster::mask() masks out partially covered cells since it call rasterize() first
      desm_prop = raster::crop(mainras, as_Spatial(px2))
      
      
      
      desm_prop_shp = if(all(is.na(values(desm_prop)))){NULL
      } else {sf::st_intersection(st_cast(sf::st_as_sf(stars::st_as_stars(desm_prop)), "POLYGON"), px)} 
      
      if(!is.null(desm_prop_shp) && any(names(desm_prop_shp) == "layer")){
        names(desm_prop_shp)[1] = "values"  # rename the raster value column
      }
      
      
      
      desm_prop_bet0108 = if(is.null(desm_prop_shp)){NULL
      } else {desm_prop_shp[desm_prop_shp$values == 1, ]} 
      
      desm_prop_bet0108 = if(is.null(desm_prop_bet0108) | length(desm_prop_bet0108) == 0){NULL
      } else if(length(desm_prop_bet0108$values) == 0){NULL
      } else {desm_prop_bet0108}
      
      
      desm_prop_after08 = if(is.null(desm_prop_shp)){NULL
      } else {desm_prop_shp[desm_prop_shp$values == 2, ]}
      
      desm_prop_after08 = if(is.null(desm_prop_after08) | length(desm_prop_after08) == 0){NULL
      } else if(length(desm_prop_after08$values) == 0){NULL
      } else {desm_prop_after08}
      
      
      desm_prop_upto00 = if(is.null(desm_prop_shp)){NULL
      } else {desm_prop_shp[desm_prop_shp$values == 3, ]}
      
      desm_prop_upto00 = if(is.null(desm_prop_upto00) | length(desm_prop_upto00) == 0){NULL
      } else if(length(desm_prop_upto00$values) == 0){NULL
      } else {desm_prop_upto00}
      
      
      
      area_desm_prop_bet0108 <- if(is.null(desm_prop_bet0108)){0 
      } else { sum(as.numeric(sf::st_area(desm_prop_bet0108)/10000))} # Deforestation area in PX 2001 - 2008
      
      area_desm_prop_after08 <- if(is.null(desm_prop_after08)){0  
      } else { sum(as.numeric(sf::st_area(desm_prop_after08)/10000))} # Deforestation area in PX after 2008
      
      area_desm_prop_upto00 <- if(is.null(desm_prop_upto00)){0 
      } else { sum(as.numeric(sf::st_area(desm_prop_upto00)/10000))} # Deforestation area in PX upto 2000
      
      ########################
      # RESULTS
      TEMP.RESULTS = data.frame(PR_ID = as.character(listID[i]),
                                PR_AREA_HA = mainpolyarea[i],
                                
                                PR_D09 = area_desm_prop_after08,
                                PR_D0108 = area_desm_prop_bet0108,
                                PR_D00 = area_desm_prop_upto00)
      
      
      return (TEMP.RESULTS)
      
    } # Ending the loop
  

  parallel::stopCluster(cl)        # stop the cluster BEFORE returning, otherwise this line is never reached
  doParallel::stopImplicitCluster()
  gc()
  
  return(FOREACH.RESULT)
  
} # Ending the function


#####################################################################################################

results_feach = per.imovel.paralallel (listID = idall, mainpoly = sample_poly, mainras = sample_raster, mainpolyarea = area)
warnings()

I also tried @mischva11's suggestion (modified) by adding the following:


length_of_chunk = round(length(idall)/(length(idall)/10)) # number of chunks (here: 10 chunks of 10 IDs each)
lchunks = split(idall, sort(rep_len(1:length_of_chunk, length(idall))))

for (z in 1:length_of_chunk){
  
  # split up the data in chunks
  idall_chunk = as.vector(unlist(lchunks[z]))
  results_chunk = per.imovel.paralallel (listID = idall_chunk, mainpoly = sample_poly, mainras = sample_raster, mainpolyarea = area)
  
  # save your foreach results for each chunk, append after the first one
  if (z == 1) {write.table(results_chunk, file = "TESTDATAresults1.csv")
    }else {write.table(results_chunk, file = "TESTDATAresults1.csv", append = TRUE, col.names = FALSE)}
  
  print(NULL) # print(results_chunk)
}

With this example it works like a charm.

However, when I run it with the real script/data I hit a snag: closing the foreach takes a long time. Watching the machine's performance and the log file, I can see that after all rows of the sf object have been processed the CPUs keep working, and the foreach call still needs more than 30 minutes (I did not wait for it to finish completely) to shut down. That is why I thought about writing the output on the fly inside the foreach workers. But apparently that is not a good idea, as described here. I have also seen a few posts about the 'flock' package, which locks the output file while it is being written. I have not tested it yet, but it sounds promising.
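
To illustrate, here is a rough, untested sketch of what I imagine the flock approach could look like inside the %dopar% body (the file names results.csv and results.csv.lock are placeholders, I am assuming the flock::lock()/flock::unlock() interface, and 'flock' would need to be added to the .packages argument):

# inside the foreach body, after TEMP.RESULTS has been built
lck = flock::lock("results.csv.lock")   # blocks until this worker holds the lock
write.table(TEMP.RESULTS, file = "results.csv", append = TRUE, sep = ",",
            col.names = FALSE, row.names = FALSE)
flock::unlock(lck)                       # release the lock so the next worker can write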

1 answer:

Answer 0: (score: 0)

The problem here is that you need communication between the cores: one core has to wait for the next one until it has finished writing to the csv. As it stands, that is not easy to do with foreach, and arguably not possible. foreach does provide the .inorder argument (TRUE by default) for this kind of combining. You told us you are running into memory issues, so one solution is to chunk up your output where possible. I do not have a good dataset for this example, so I am using mtcars, which will be padded with NAs:

library(foreach)
library(parallel)
library(doParallel)


registerDoParallel(4)

# split the output here; I use 5 chunks, and my data is mtcars
length_of_chunk <- ceiling(nrow(mtcars)/5)   # rows per chunk
for (z in 0:4){
  # here the data gets split up (the last chunk is padded with NA rows)
  data <- mtcars[(z*length_of_chunk + 1):((z + 1)*length_of_chunk), ]
  # foreach over the rows of this chunk
  results <- foreach(i = 1:length_of_chunk, .combine = rbind) %dopar% {
    #***your code***
    y <- data[i, ]
    return(y)
  }
  print(results)
  # save your foreach results for this chunk and then begin again; append after the first one
  if (z == 0) {
    write.table(results, file = "test.csv")
  } else {
    write.table(results, file = "test.csv", append = TRUE, col.names = FALSE)
  }
}