R h2o connection (memory) problem

Time: 2017-05-31 09:59:02

Tags: r memory memory-management machine-learning h2o

I am trying to run an optimization grid for 2 algorithms (gbm and random forest) in h2o on different parts of my dataset. My code looks like

for (...)
{
    # read data for this part of the dataset

    # setup h2o cluster
    h2o <- h2o.init(ip = "localhost", port = 54321, nthreads = detectCores()-1)

    gbm.grid <- h2o.grid("gbm", grid_id = "gbm.grid", x = names(td.train.h2o)[!names(td.train.h2o)%like%segment_binary], y = segment_binary,
                         seed = 42, distribution = "bernoulli",
                         training_frame = td.train.h2o, validation_frame = td.train.hyper.h2o,
                         hyper_params = hyper_params, search_criteria = search_criteria)

    # shutdown h2o
    h2o.shutdown(prompt = FALSE)

    # setup h2o cluster
    h2o <- h2o.init(ip = "localhost", port = 54321, nthreads = detectCores()-1)

    rf.grid <- h2o.grid("randomForest", grid_id = "rf.grid", x = names(td.train.h2o)[!names(td.train.h2o)%like%segment_binary], y = segment_binary,
                        seed = 42, distribution = "bernoulli",
                        training_frame = td.train.h2o, validation_frame = td.train.hyper.h2o,
                        hyper_params = hyper_params, search_criteria = search_criteria)

    # shutdown h2o
    h2o.shutdown(prompt = FALSE)
}

The problem is that when I run the for loop in one go, I get the error

Error in .h2o.doSafeREST(h2oRestApiVersion = h2oRestApiVersion, urlSuffix = urlSuffix,  : 
  Unexpected CURL error: Failed to connect to localhost port 54321: Connection refused

P.S.: I am using

# shutdown h2o
h2o.shutdown(prompt = FALSE)

# setup h2o cluster
h2o <- h2o.init(ip = "localhost", port = 54321, nthreads = detectCores()-1)

to "reset" h2o, so that I don't run out of memory.

I have also read R H2O - Memory management, but it is not clear to me how that works.

UPDATE

Following Matteusz's comment, I now call init outside the for loop and use h2o.removeAll() inside the for loop. So my code now looks like this

 h2o <- h2o.init(ip = "localhost", port = 54321, nthreads = detectCores()-1)
for(...)
{
read data

gbm.grid <- h2o.grid("gbm", grid_id = "gbm.grid", x = names(td.train.h2o)[!names(td.train.h2o)%like%segment_binary], y = segment_binary, 
                             seed = 42, distribution = "bernoulli",
                             training_frame = td.train.h2o, validation_frame = td.train.hyper.h2o,
                             hyper_params = hyper_params, search_criteria = search_criteria)

h2o.removeAll()

rf.grid <- h2o.grid("randomForest", grid_id = "rf.grid", x = names(td.train.h2o)[!names(td.train.h2o)%like%segment_binary], y = segment_binary, 
                        seed = 42, distribution = "bernoulli",
                        training_frame = td.train.h2o, validation_frame = td.train.hyper.h2o,
                        hyper_params = hyper_params, search_criteria = search_criteria)

h2o.removeAll() }

It seems to work, but now I get this error (?) during the grid optimization for the random forest:

(screenshot of the error message)

Any ideas what this could be?

2 answers:

Answer 0: (score: 3)

This seems very wasteful, starting h2o twice on every iteration. If you just want to free the memory, you can use h2o.removeAll() instead.

As for the cause: h2o.shutdown() (any H2O shutdown) is not a synchronous operation, and some cleanup (for example handling outstanding requests) can still be going on after the function returns. You can check with h2o.clusterIsUp() whether the cluster is actually down before starting it again with init.
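A rough sketch (not code from the answer) of how that check could be used if you really do want to restart the cluster between runs; the polling loop and the tryCatch guard are my own assumptions:

# shut down and wait until the old cluster has actually gone away
h2o.shutdown(prompt = FALSE)
while (tryCatch(h2o.clusterIsUp(), error = function(e) FALSE)) {
    Sys.sleep(1)   # give H2O time to finish its cleanup
}

# only start a fresh cluster once the old one is confirmed down
h2o <- h2o.init(ip = "localhost", port = 54321, nthreads = detectCores() - 1)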

Answer 1: (score: 1)

The reason for the error is that you are not changing the grid_id argument inside the loop. My suggestion is to let H2O generate the grid IDs automatically by leaving them unspecified / NULL. You could also create different grid IDs manually (one per dataset), but that is not necessary.

You can only add new models to an existing grid (by re-using the same grid ID) when you are using the same training set. When you put the grid search inside a for loop over different datasets and keep the same grid ID, it throws an error, because you are trying to append models trained on different datasets to the same grid.
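A minimal sketch of that suggestion, reusing the call from the question; the loop counter i is a hypothetical placeholder (anything that differs per dataset works), and omitting grid_id altogether would let H2O pick a fresh ID on its own:

for (i in ...)   # hypothetical iteration counter over the parts of the dataset
{
    # read data for iteration i

    gbm.grid <- h2o.grid("gbm",
                         grid_id = paste0("gbm.grid.", i),   # unique per dataset, or drop grid_id entirely
                         x = names(td.train.h2o)[!names(td.train.h2o) %like% segment_binary],
                         y = segment_binary,
                         seed = 42, distribution = "bernoulli",
                         training_frame = td.train.h2o, validation_frame = td.train.hyper.h2o,
                         hyper_params = hyper_params, search_criteria = search_criteria)
}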