I am trying to build a stacked ensemble using H2O in R. It has five base learners: Random Forest, XGBoost, GLM, GBM, and Naive Bayes. It is a classification problem with three levels. The base learners run successfully and return accuracy values on the test dataset.
When the base learners are passed to h2o.stackedEnsemble, the following error is returned:
Error: water.exceptions.H2OIllegalArgumentException: water.exceptions.H2OIllegalArgumentException: Don't know how to determine the distribution for a multinomial classifier.
Here is the code snippet for the stacked-ensemble section:
ensemble <- h2o.stackedEnsemble(x = setdiff(colnames(trainPCA),
                                            c(depVarsMulti, "weightage")), # names of indep vars
                                y = depVarsMulti, # dep var
                                training_frame = trainPCA,
                                model_id = "123",
                                base_models = c(ModelOneRF@model_id,
                                                ModelTwoXGBoost@model_id,
                                                ModelThreeGLM@model_id,
                                                ModelFourGBM@model_id,
                                                ModelFiveBayes@model_id),
                                metalearner_algorithm = "drf",
                                metalearner_nfolds = nfolds)
Other details:
I was able to build a stacked ensemble similar to the one given here.
H2O version: "3.21.0.4359" | R version: "3.4.1 (2017-06-30)"
The H2O cluster is a local cluster.
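For reference, these environment details can be queried from a live R/H2O session; a minimal sketch (assuming a cluster has already been started with h2o.init()):
library(h2o)
h2o.getVersion()   # H2O cluster version, e.g. "3.21.0.4359"
R.version.string   # R version string, e.g. "R version 3.4.1 (2017-06-30)"
h2o.clusterInfo()  # prints cluster details (nodes, memory, local vs. remote)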
EDIT (August 3, 2018):
As suggested by Darren, I am adding a script that reproduces the problem using the open Cars93 dataset (from the MASS package):
#######################################################################
# Minimum reproducible example for Stackoverflow
#######################################################################
# R version: 3.4.4 (2018-03-15)
# H2O cluster version: 3.21.0.4376
#OS: Linux (Azure Data Science VM)
#Installing and loading necessary libraries
cat("\n Installing and loading necessary libraries \n")
libsNeeded <- c("dplyr", "data.table", "randomForest", "stringr","doParallel", "parallel", "doSNOW", "rlang", "nlme", "MASS", "survival", "stringi", "dummies", "missRanger","cluster", "e1071","xgboost","ranger", "caret")
if(length(setdiff(libsNeeded, rownames(installed.packages()))) > 0){
install.packages(setdiff(libsNeeded, rownames(installed.packages())))
}
lapply(libsNeeded, require, character.only = T)
#Installing latest H2O if not done already:
# install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")))
library(h2o)
#Starting an H2O cluster
h2o.init(max_mem_size = "23g")
library(MASS)  # Cars93 ships with the MASS package
dataFrame <- Cars93
# Removing rows where Passengers is 2, 7, or 8: these levels occur rarely, and for demonstration purposes we want to avoid errors caused by such sparse classes
dataFrame <- dataFrame[!(dataFrame$Passengers %in% c("2", "7", "8")),]
# Converting the dependent variable to a factor
dataFrame$Passengers <- as.factor(dataFrame$Passengers)
#Defining the variables to be used in modeling
depVars <- "Passengers"
indepNumVars <- c("Price","MPG.highway","EngineSize","Horsepower")
indepFactVars <- c("AirBags","Type")
#Keeping only columns of interest
dataFrame <- dataFrame[,c(indepFactVars,indepNumVars,depVars)]
# Converting the independent factor variables into dummy variables:
dataFrame <- dummy.data.frame(dataFrame, names = indepFactVars, sep = "_")
names(dataFrame) <- gsub(" ", "_", names(dataFrame))
#Creating the train and test datasets
trainIndex <- createDataPartition(dataFrame[,depVars], times = 1, p = 0.75)
trainingData <- dataFrame[trainIndex$Resample1,]
testingData <- dataFrame[-trainIndex$Resample1,]
# H2O Frames
train <- as.h2o(trainingData)
test <- as.h2o(testingData)
# Perform PCA
depData <- train[, depVars]
train <- train[, setdiff(names(train), c(depVars))]
pca_model <- h2o.prcomp(training_frame = train,
model_id = NULL,
ignore_const_cols = TRUE,
transform = "STANDARDIZE",
pca_method = "GramSVD",
k = 10,
max_iterations = 5000,
seed = -1,
score_each_iteration = TRUE,
use_all_factor_levels = FALSE,
compute_metrics = TRUE,
max_runtime_secs = 0,
impute_missing = T)
cum_prop <- pca_model@model$model_summary["Cumulative Proportion", ]
# print(cum_prop)
cum_prop_to_consider <- length(cum_prop[cum_prop < .95]) + 1
cat("\n\n Number of principal components that explain 95% variance = ",cum_prop_to_consider,"\n\n")
trainPCA <- h2o.predict(pca_model, train)
# If even all k components fall short of 95% cumulative variance, keep all of them
if(cum_prop_to_consider > ncol(trainPCA)){
  trainPCA <- trainPCA[, 1:(cum_prop_to_consider - 1)]
}else{
  trainPCA <- trainPCA[, 1:cum_prop_to_consider]
}
# pca_data <- as.data.table(pca_data)
trainPCA[, depVars] <- depData[, depVars]
#Preparing the test data:
testPCA <- h2o.predict(pca_model,test)
if(cum_prop_to_consider > ncol(testPCA)){
testPCA <- testPCA[, 1:(cum_prop_to_consider - 1)]
}else{
testPCA <- testPCA[, 1:cum_prop_to_consider]
}
testPCA[, depVars] <- test[, depVars]
# For classification, the response must be a factor
trainPCA[, depVars] <- as.factor(trainPCA[, depVars])
testPCA[, depVars] <- as.factor(testPCA[, depVars])
# Observation weights for the training data (classes "4" and "6" get weight 2, others weight 1):
trainPCA$weightage <- ifelse(trainPCA[, depVars] == "4", 2,
                             ifelse(trainPCA[, depVars] == "6", 2, 1))
# Number of CV folds (to generate level-one data for stacking).
# Note: every base learner below uses the same nfolds and
# keep_cross_validation_predictions = TRUE, which h2o.stackedEnsemble
# requires in order to build its level-one frame.
nfolds <- 5
####################################################################################################
# Stacked Ensemble modeling
####################################################################################################
modelIteration <- Sys.Date()
modelIteration <- gsub("-", "_", modelIteration)
i = "withInsp"
# Train & cross-validate an RF
ModelOneRF <- h2o.randomForest(x = setdiff(colnames(trainPCA),depVars),
y = depVars,
training_frame = trainPCA,
ntrees = 15,
nfolds = nfolds,
fold_assignment = "Stratified",
max_depth = 30,
min_rows = 1,
mtries = 3,
keep_cross_validation_predictions = TRUE,
seed = 1,
# verbose = T,
weights_column = "weightage",
model_id = paste0(i,"_ModelOneRF_",modelIteration))
cat("\n\n Mean accuracy of Random Forest Model (on cross validation):",ModelOneRF@model$cross_validation_metrics_summary[1,1],"\n\n")
perf_RF <- h2o.performance(model = ModelOneRF, newdata = testPCA)
cat("\n\n Accuracy of Random Forest Model (on test data):",1 - perf_RF@metrics$mean_per_class_error,"\n\n")
# Train & cross-validate an XGBoost model
ModelTwoXGBoost <- h2o.xgboost(x = setdiff(colnames(trainPCA),depVars),
y = depVars,
training_frame = trainPCA,
nfolds = nfolds,
fold_assignment = "Stratified",
weights_column = "weightage",
ntrees = 15,
max_depth = 20,
min_rows = 1,
learn_rate = 0.1,
eta = 0.3,
keep_cross_validation_predictions = TRUE,
seed = 1,
# verbose = T,
model_id = paste0(i,"_ModelTwoXGBoost_",modelIteration))
cat("\n\n Mean accuracy of XGBoost Model (on cross validation):",ModelTwoXGBoost@model$cross_validation_metrics_summary[1,1],"\n\n")
perf_XGBoost <- h2o.performance(model = ModelTwoXGBoost, newdata = testPCA)
cat("\n\n Accuracy of XGBoost Model (on test data):",1 - perf_XGBoost@metrics$mean_per_class_error,"\n\n")
#Train and cross validate a Generalized Linear Model (GLM)
ModelThreeGLM <- h2o.glm(family= "multinomial",
x = setdiff(colnames(trainPCA),depVars),
y = depVars,
training_frame = trainPCA,
nfolds = nfolds,
fold_assignment = "Stratified",
weights_column = "weightage",
alpha = 0.0,
lambda_search = T,
standardize = T,
seed = 1,
# verbose = T,
model_id = paste0(i,"_ModelThreeGLM_",modelIteration),
keep_cross_validation_predictions = TRUE)
cat("\n\n Mean accuracy of GLM Model (on cross validation):",ModelThreeGLM@model$cross_validation_metrics_summary[1,1],"\n\n")
perf_GLM <- h2o.performance(model = ModelThreeGLM, newdata = testPCA)
cat("\n\n Accuracy of GLM Model (on test data):",1 - perf_GLM@metrics$mean_per_class_error,"\n\n")
#Train and cross validate a Gradient Boosting Machine (GBM)
ModelFourGBM <- h2o.gbm(x = setdiff(colnames(trainPCA),depVars),
y = depVars,
training_frame = trainPCA,
nfolds = nfolds,
fold_assignment = "Stratified",
weights_column = "weightage",
ntrees = 10,
max_depth = 20,
seed = 1,
learn_rate = 0.05,
learn_rate_annealing = 0.99,
# verbose = T,
keep_cross_validation_predictions = TRUE,
model_id = paste0(i,"_ModelFourGBM_",modelIteration))
cat("\n\n Mean accuracy of GBM Model (on cross validation):",ModelFourGBM@model$cross_validation_metrics_summary[1,1],"\n\n")
perf_GBM <- h2o.performance(model = ModelFourGBM, newdata = testPCA)
cat("\n\n Accuracy of GBM Model (on test data):",1 - perf_GBM@metrics$mean_per_class_error,"\n\n")
#Train and cross validate a Naïve Bayes Model
ModelFiveBayes <- h2o.naiveBayes(x = setdiff(colnames(trainPCA),c(depVars,"weightage")),
y = depVars,
training_frame = trainPCA,
nfolds = nfolds,
fold_assignment = "Stratified",
# weights_column = "weightage",
seed = 1,
# verbose = T,
keep_cross_validation_predictions = TRUE,
model_id = paste0(i,"_ModelFiveBayes_",modelIteration))
cat("\n\n Mean accuracy of Naive Bayes Model (on cross validation):",ModelFiveBayes@model$cross_validation_metrics_summary[1,1],"\n\n")
perf_Bayes <- h2o.performance(model = ModelFiveBayes, newdata = testPCA)
cat("\n\n Accuracy of Naive Bayes Model (on test data):",1 - perf_Bayes@metrics$mean_per_class_error,"\n\n")
# Train a stacked ensemble using the five base learners above
ensemble <- h2o.stackedEnsemble(x = setdiff(colnames(trainPCA), c(depVars, "weightage")),
                                y = depVars,
                                training_frame = trainPCA,
                                # model_id = paste0(i,"_ModelEnsemble_",modelIteration),
                                model_id = paste0(i,"_ModelEnsemble_2_",modelIteration),
                                base_models = c(ModelOneRF@model_id,
                                                ModelTwoXGBoost@model_id,
                                                ModelThreeGLM@model_id,
                                                ModelFourGBM@model_id,
                                                ModelFiveBayes@model_id),
                                metalearner_algorithm = "drf",
                                metalearner_nfolds = nfolds)
Answer (score: 2):
This looks like a bug (I have filed a bug report here). It seems the multinomial case does not work when XGBoost or Naive Bayes is among the base models (we are missing test coverage for those two cases). If you run the code below, which drops those two models, it works. We will get this fixed soon. Thanks.
ensemble <- h2o.stackedEnsemble(x = setdiff(colnames(trainPCA),c(depVars,"weightage")),
y = depVars,
training_frame = trainPCA,
base_models = c(ModelThreeGLM@model_id, ModelFourGBM@model_id, ModelOneRF@model_id),
metalearner_algorithm = "drf",
metalearner_nfolds = nfolds)
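To sanity-check the workaround, the ensemble's test performance can be evaluated the same way as the base learners above; a minimal sketch reusing the testPCA frame and the same accuracy convention as the script:
# Evaluate the three-model ensemble on the held-out test frame
perf_ensemble <- h2o.performance(model = ensemble, newdata = testPCA)
cat("\n\n Accuracy of Stacked Ensemble (on test data):",
    1 - perf_ensemble@metrics$mean_per_class_error, "\n\n")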
EDIT: The bug has been fixed and merged. It will be available in the nightly release starting tonight (August 7, 2018), or in the next fix release, 3.20.0.5 (out in the next few days).
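Once a build containing the fix is available, upgrading the R client follows the standard H2O procedure; a minimal sketch (the repository URL is the same latest_stable_R pattern commented out in the script above):
# Upgrade the h2o R package to a build containing the fix
if ("package:h2o" %in% search()) detach("package:h2o", unload = TRUE)
if ("h2o" %in% rownames(installed.packages())) remove.packages("h2o")
install.packages("h2o", type = "source",
                 repos = "http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R")
library(h2o)
h2o.init()
h2o.getVersion()  # confirm the upgraded cluster version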