我正在对分类模型进行网格搜索。当h20服务器启动时,我得到:
R is connected to the H2O cluster:
H2O cluster uptime: 9 minutes 35 seconds
H2O cluster version: 3.10.4.8
H2O cluster version age: 14 days, 4 hours and 1 minute
H2O cluster name: H2O_started_from_R_Charles_huu844
H2O cluster total nodes: 1
H2O cluster total memory: 21.31 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
R Version: R version 3.2.2 (2015-08-14)
当我的模型进入网格搜索时,我收到以下错误消息:
ERROR: Unexpected HTTP Status code: 412 Precondition Failed (url = http://localhost:54321/99/Grids/mygrid?sort_by=auc&decreasing=TRUE)
water.exceptions.H2OIllegalArgumentException
[1] "water.exceptions.H2OIllegalArgumentException: Invalid argument for sort_by specified. Must be one of: [r2, mean_per_class_accuracy, max_per_class_error, err, total_rows, rmse, accuracy, err_count, logloss, mse, mean_per_class_error]"
[2] " hex.schemas.GridSchemaV99.fillFromImpl(GridSchemaV99.java:114)"
我的R脚本如下:
rm(list=ls())
options(scipen=999) # remove E notation
ptm <- proc.time()
if (Sys.info()["sysname"] == "Windows") {
filePath = "//bigsur/sm/Trending/model.csv"
homedir = "c:/sm/Trending/"
setwd("c:/sm/Trending/R")
sink("C:/Users/Charles/Desktop/log.txt")
} else {
filePath = "/Volumes/sm/Trending/model.csv"
homedir = "/Volumes/sm/Trending/"
setwd("/Volumes/sm/Trending/R")
sink("~/Desktop/log.txt")
}
#install.packages("ggplot2")
#install.packages("dplyr")
sink.reset <- function(){
for(i in seq_len(sink.number())){
sink(NULL)
}
}
printf <- function(...) print(sprintf(...))
results = function(title, best_model) {
best_params = best_model@parameters
best_activation <<- best_model@parameters$activation
best_hidden <<- best_model@parameters$hidden
best_l1 <<- best_model@parameters$l1
best_l2 <<- best_model@parameters$l2
best_input_dropout_ratio <<- best_model@parameters$input_dropout_ratio
best_hidden_dropout_ratios <<- best_model@parameters$hidden_dropout_ratios
printf(" ")
printf("%s", title)
best_model
plot(best_model)
h2o.performance(best_model)
h2o.performance(best_model, valid = TRUE)
h2o.mse(best_model, valid = TRUE)
printf("mse: %f", best_model@model$validation_metrics@metrics$MSE)
printf("best activation: %s", best_activation)
cat("best hidden layers: ", best_hidden, "\n")
printf("Best l1: %f", best_l1)
printf("Best l2: %f", best_l2)
#printf("best_input_dropout_ratio: %f", best_input_dropout_ratio)
#cat("Best best_hidden_dropout_ratios: ", best_hidden_dropout_ratios, "\n")
predictions = h2o.predict(best_model, test)
summary(predictions, exact_quantiles=TRUE)
predicted = predictions[,1]
test_targets = test[, 5]
correct = predicted == test_targets
numCorrect = as.integer(sum(correct))
ntotal = as.integer(nrow(correct))
percent = round(numCorrect/ntotal*100, 2)
printf("Correct classifications on all data: %d/%d (%f)", numCorrect, ntotal, percent)
#predicted.h2o = h2o.assign(predicted, key = "predicted.h2o")
#correct.h2o = h2o.assign(correct, key = "correct.h2o")
perf_test = h2o.performance(model = best_model, newdata = test)
cat("\nPerformance on test dataset\n")
print(perf_test)
cat("\nConfusion matrix on test dataset\n")
h2o.confusionMatrix(perf_test)
# Plot Receiver Operating Characteristic (ROC) curve and find its Area Under the Curve (AUC)
# A ROC curve is a graph of the true positive rate (recall) against the false positive
# rate for a binary classifier.
#plot(f1_best_model, type = "cutoffs", col = "blue")
cm = h2o.confusionMatrix(best_model, train)
print("Confusion Matrix: ")
print(cm)
true_negative = cm[1,1]
true_positive = cm[2,2]
false_negative = cm[2,1]
false_positive = cm[1,2]
total = true_negative + true_positive + false_negative + false_positive
accuracy = (true_positive + true_negative)/total
printf("accuracy: %f", accuracy)
misclassification_rate = (false_positive + false_negative)/total
printf("misclassification_rate: %f", misclassification_rate)
cat("\nVariable importance\n")
print(best_model@model$variable_importance)
return_list = c(best_activation,
best_hidden,
best_l1,
best_l2,
best_input_dropout_ratio,
best_hidden_dropout_ratios)
return (return_list)
}
library(h2o)
library(dplyr)
library(data.table)
library(ggplot2)
localH2O = h2o.init(ip = "localhost",
port = 54321,
startH2O = TRUE,
max_mem_size="24G",
nthreads = -1)
h2o.no_progress()
h2o.removeAll() ## clean slate - just in case the cluster was already running
print(filePath)
model.full <- read.csv(filePath, header = TRUE, sep = ",")
head(model.full)
remove = !colnames(model.full) %in% c("Date",
"Symbol",
"BuyIndex",
"SellIndex",
"BoxRatio",
"Acceleration",
"nPosVelo",
"Gain")
model_orig = model.full[, remove]
head(model_orig)
model = model_orig[sample(nrow(model_orig)),] # shuffle the rows
head(model)
df <- as.h2o(model, destination_frame = "df")
splits <- h2o.splitFrame(df, c(0.6,0.2), seed=1234)
train <- h2o.assign(splits[[1]], "train.hex") # 60%
valid <- h2o.assign(splits[[2]], "valid.hex") # 20%
test <- h2o.assign(splits[[3]], "test.hex") # 20%
printf("train----------------------------------------")
head(train)
train
printf("valid----------------------------------------")
#head(valid)
valid
printf("test-----------------------------------------")
#head(test)
test
p1 = train$Thrust
p2 = train$Velocity
p3 = train$OnBalRun
p4 = train$vwapGain
p1d = rbind(lapply(p1, as.double)) # p1 is an environment variable, we need doubles
p2d = rbind(lapply(p2, as.double))
p3d = rbind(lapply(p3, as.double))
p4d = rbind(lapply(p4, as.double))
a = unlist(p1d)
b = unlist(p2d)
c = unlist(p3d)
d = unlist(p4d)
pairs(train[1:4], main = "Scatterplot of predictors", pch = 21, cex = 0.8, bg = c("green3", "red")[unclass(model$Altitude)])
cat("\n\n1. Summary of train dataset------------------------------------------------------\n")
summary(train, exact_quantiles=TRUE)
cat("\n\n2. Grid Search on valid data ----------------------------------------------------\n")
activation_opts = c("RectifierWithDropout","TanhWithDropout","MaxoutWithDropout")
hidden_opts = list(c(80),c(100),c(200),c(300),c(400),c(500),
c(80,80),c(100,100),c(200,200),c(300,300),c(400,400),c(500,500),
c(80,80,80,80),c(100,100,100,100),c(200,200,200,200),c(300,300,300,300),
c(80,80,80,80,80),c(100,100,100,100,100),c(200,200,200,200,200),
c(300,300,300,300,300),c(400,400,400,400,400),c(500,500,500,500,500),
c(80,80,80,80,80,80),c(100,100,100,100,100,100),c(200,200,200,200,200,200),
c(300,300,300,300,300,300),c(400,400,400,400,400,400),
c(500,500,500,500,500,500)
)
l1_opts = runif(1, 0, 0.0001)
l2_opts = runif(1, 0, 0.0001)
hyperparams = list(
activation = activation_opts,
hidden = hidden_opts,
l1 = l1_opts,
l2 = l2_opts,
max_w2 = 10
)
search_criteria = list(strategy = "RandomDiscrete",
stopping_metric = "misclassification",
max_models = 10000,
max_runtime_secs = 72000,
stopping_tolerance = 0.00001,
stopping_rounds = 10)
grid_model = h2o.grid(algorithm = "deeplearning",
grid_id = "mygrid",
hyper_params = hyperparams,
search_criteria = search_criteria,
x = 1:4,
y = 5,
training_frame = train,
validation_frame = valid,
variable_importances = TRUE,
balance_classes=TRUE,
score_training_samples=1000,
score_validation_samples=1000,
score_validation_sampling="Stratified",
epochs = 1000000,
seed = 7)
cat("\n\n3. Summary of grid_model---------------------------------------------------------\n")
grid = h2o.getGrid("mygrid", sort_by="auc", decreasing=TRUE)
summary(grid)
cat("\n\n4. Performance of best_model-----------------------------------------------------\n")
best_model = h2o.getModel(grid@model_ids[[1]])
results("5.best_model", best_model)
cat("\n\n6. Performance valid dataset-----------------------------------------------------\n")
perf_valid = h2o.performance(model = best_model, newdata = valid)
perf_valid
cat("\n\n7. Performance on test dataset---------------------------------------------------\n")
perf_test = h2o.performance(model = best_model, newdata = test)
perf_test
plot(perf_test, type="roc") # Plot the roc curve
predicted <- h2o.predict( best_model, test )
actual = test[,5]
cat("\n\n8. Mean prediction on the test set: ", 100*mean( predicted$predict == actual ), "%\n")
correct = predicted == actual
numCorrect = as.integer(sum(correct))
ntotal = as.integer(nrow(correct))
percent = round(numCorrect/ntotal*100, 2)
printf("9. Test of Mean prediction on the test set: %d/%d (%f)", numCorrect, ntotal, percent)
cat("\n\n10. The 'test' set auc is: ", h2o.auc(perf_test), "\n")
p = h2o.saveModel(best_model, path = "C:\\sm\\Trending\\h2o_model", force = TRUE)
p
minutes = (proc.time() - ptm)[1]
printf("Elapsed time: %.2f minutes", minutes)
unlink("log.txt")
sink.reset()
我的日志是:
Connection successful!
R is connected to the H2O cluster:
H2O cluster uptime: 9 minutes 35 seconds
H2O cluster version: 3.10.4.8
H2O cluster version age: 14 days, 4 hours and 1 minute
H2O cluster name: H2O_started_from_R_Charles_huu844
H2O cluster total nodes: 1
H2O cluster total memory: 21.31 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: TRUE
H2O Connection ip: localhost
H2O Connection port: 54321
H2O Connection proxy: NA
H2O Internal Security: FALSE
R Version: R version 3.2.2 (2015-08-14)
[1] 0
[1] "//bigsur/sm/Trending/model.csv"
Date Symbol BuyIndex SellIndex BoxRatio Thrust Acceleration Velocity nPosVelo
1 4/5/2017 GBX 132 199 77.724 49.7190 107.5811 2.9236 59
2 1/5/2017 RH 216 259 4.838 4.8380 137.0574 0.3381 14
3 9/28/2016 DDS 149 444 0.150 0.4860 99.1360 0.1081 59
4 11/25/2016 JKS 132 186 0.932 0.8686 38.9931 0.6057 59
5 12/12/2016 JNUG 132 177 0.582 0.3242 87.1144 1.1274 18
6 3/7/2017 LPL 134 180 8.954 8.9540 34.1691 0.4778 59
OnBalRun vwapGain Gain Altitude
1 12.6378 90.0656 -0.1659 no
2 1.5157 0.9495 -0.6496 no
3 0.5476 2.2887 4.1439 yes
4 3.2719 3.8235 0.5051 no
5 2.5006 12.0472 -0.8942 no
6 2.9935 1.1234 -0.1617 no
Thrust Velocity OnBalRun vwapGain Altitude
1 49.7190 2.9236 12.6378 90.0656 no
2 4.8380 0.3381 1.5157 0.9495 no
3 0.4860 0.1081 0.5476 2.2887 yes
4 0.8686 0.6057 3.2719 3.8235 no
5 0.3242 1.1274 2.5006 12.0472 no
6 8.9540 0.4778 2.9935 1.1234 no
Thrust Velocity OnBalRun vwapGain Altitude
4427 0.9370 0.3176 1.2786 2.3151 no
3079 2.2060 0.9261 1.1257 1.2506 no
3952 0.0702 0.4430 1.1485 0.9928 no
7765 1.1596 1.1067 6.2563 2.1164 yes
1682 0.6708 0.4519 1.3848 2.1808 no
5145 4.5600 0.3462 1.7386 0.7722 no
[1] "train----------------------------------------"
Thrust Velocity OnBalRun vwapGain Altitude
1 2.2060 0.9261 1.1257 1.2506 no
2 0.6708 0.4519 1.3848 2.1808 no
3 4.5600 0.3462 1.7386 0.7722 no
4 3.6930 3.2778 11.4092 49.3335 no
5 0.9980 0.4035 1.6667 1.1264 no
6 0.2016 0.5627 2.4101 1.2642 no
Thrust Velocity OnBalRun vwapGain Altitude
1 2.2060 0.9261 1.1257 1.2506 no
2 0.6708 0.4519 1.3848 2.1808 no
3 4.5600 0.3462 1.7386 0.7722 no
4 3.6930 3.2778 11.4092 49.3335 no
5 0.9980 0.4035 1.6667 1.1264 no
6 0.2016 0.5627 2.4101 1.2642 no
[5548 rows x 5 columns]
[1] "valid----------------------------------------"
Thrust Velocity OnBalRun vwapGain Altitude
1 0.9370 0.3176 1.2786 2.3151 no
2 0.0702 0.4430 1.1485 0.9928 no
3 1.0230 0.3119 3.0922 0.8788 no
4 6.4100 0.9966 5.3490 2.9436 yes
5 6.9620 0.7004 3.5810 4.8905 no
6 1.6800 1.4518 5.1933 1.7955 no
[1875 rows x 5 columns]
[1] "test-----------------------------------------"
Thrust Velocity OnBalRun vwapGain Altitude
1 1.1596 1.1067 6.2563 2.1164 yes
2 4.7010 0.5369 1.1266 7.5566 no
3 1.7110 0.9247 3.5819 3.0598 no
4 1.4620 0.3315 4.3097 0.4129 no
5 0.5610 0.4494 1.8738 1.3942 no
6 6.7255 1.7309 5.6268 4.4937 yes
[1823 rows x 5 columns]
1. Summary of train dataset------------------------------------------------------
Thrust Velocity OnBalRun vwapGain Altitude
Min. : -1.4845 Min. :-0.1241 Min. : -0.5299 Min. : -4.7648 no :4875
1st Qu.: 0.3984 1st Qu.: 0.3281 1st Qu.: 1.1468 1st Qu.: 0.8684 yes: 671
Median : 1.0425 Median : 0.3815 Median : 1.6439 Median : 1.9954 : 2
Mean : 12.1332 Mean : 0.5913 Mean : 2.5911 Mean : 4.8876
3rd Qu.: 1.9723 3rd Qu.: 0.5716 3rd Qu.: 2.6507 3rd Qu.: 4.7500
Max. :41279.8960 Max. :29.4449 Max. :154.5988 Max. :314.7143
NA's :2 NA's :2 NA's :2 NA's :2
2. Grid Search on valid data ----------------------------------------------------
3. Summary of grid_model---------------------------------------------------------
ERROR: Unexpected HTTP Status code: 412 Precondition Failed (url = http://localhost:54321/99/Grids/mygrid?sort_by=auc&decreasing=TRUE)
water.exceptions.H2OIllegalArgumentException
[1] "water.exceptions.H2OIllegalArgumentException: Invalid argument for sort_by specified. Must be one of: [r2, mean_per_class_accuracy, max_per_class_error, err, total_rows, rmse, accuracy, err_count, logloss, mse, mean_per_class_error]"
[2] " hex.schemas.GridSchemaV99.fillFromImpl(GridSchemaV99.java:114)"
[3] " water.api.GridsHandler.fetch(GridsHandler.java:41)"
[4] " sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)"
[5] " sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)"
[6] " sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)"
[7] " java.lang.reflect.Method.invoke(Method.java:498)"
[8] " water.api.Handler.handle(Handler.java:61)"
[9] " water.api.RequestServer.serve(RequestServer.java:436)"
[10] " water.api.RequestServer.doGeneric(RequestServer.java:285)"
[11] " water.api.RequestServer.doGet(RequestServer.java:220)"
[12] " javax.servlet.http.HttpServlet.service(HttpServlet.java:735)"
[13] " javax.servlet.http.HttpServlet.service(HttpServlet.java:848)"
[14] " org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:684)"
[15] " org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:503)"
[16] " org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1086)"
[17] " org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:429)"
[18] " org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1020)"
[19] " org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)"
[20] " org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)"
[21] " org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)"
[22] " water.JettyHTTPD$LoginHandler.handle(JettyHTTPD.java:417)"
[23] " org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)"
[24] " org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)"
[25] " org.eclipse.jetty.server.Server.handle(Server.java:370)"
[26] " org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:494)"
[27] " org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)"
[28] " org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:971)"
[29] " org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:1033)"
[30] " org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:644)"
[31] " org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:235)"
[32] " org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)"
[33] " org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)"
[34] " org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)"
[35] " org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)"
[36] " java.lang.Thread.run(Thread.java:745)"
4. Performance of best_model-----------------------------------------------------
6. Performance valid dataset-----------------------------------------------------
7. Performance on test dataset---------------------------------------------------
Called from: sprintf(...)
[1] "Elapsed time: 85.98 minutes"
请帮忙。
查尔斯
答案 0 :(得分:2)
AUC是用于分类的指标,您可以根据我的观点创建一个~refression~模型,因此您无法在auc
中使用grid = h2o.getGrid("mygrid", sort_by="auc", decreasing=TRUE)
。而是使用,如错误消息所示,使用以下指标之一:r2, mean_per_class_accuracy, max_per_class_error, err, total_rows, rmse, accuracy, err_count, logloss, mse, mean_per_class_error
。