I am trying to set up an xgboost classification model in R, but I get different results (e.g. different predictions on the test dataset) even though I use the same settings (?). I used the Ionosphere dataset from the mlbench package and tried to keep the example as simple as possible (no cross-validation, no parameter tuning, etc.).
Does anyone know why I get the different results shown below? Since caret is just a wrapper around xgboost ("it just calls the same xgboost package"), the results should be exactly the same.
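As a first check on that "just a wrapper" assumption, caret's model registry can be inspected directly; this is only a diagnostic, separate from the example below, showing the fit function that method = "xgbTree" ultimately runs (getModelInfo() is part of caret, and the printed body should end in a call to xgboost::xgb.train()):

library(caret)
# print the wrapper function caret runs for method = "xgbTree"
getModelInfo("xgbTree", regex = FALSE)$xgbTree$fit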
library(caret)
library(xgboost)
library(mlbench)
#####
###Load & Prepare Data
#####
data(Ionosphere)
dataset <- Ionosphere
dataset <- dataset[,-c(2)] # remove V2 (constant everywhere)
dataset$V1 <- as.numeric(as.character(dataset$V1)) # factor to numeric
dataset$Class <- ifelse(dataset$Class == "good", 1, 0) # convert good -> 1 and bad -> 0
dataset$Class <- as.factor(dataset$Class) # convert to factor
#####
###Create Train & Test Dataset
#####
set.seed(1992)
validation_index <- createDataPartition(dataset$Class, p = 0.8, list = FALSE)
testSet  <- dataset[-validation_index,]
trainSet <- dataset[validation_index,]
# xgb.DMatrix for xgb.train() (column 34 is the Class label)
xgb.trainData <- xgb.DMatrix(data = data.matrix(trainSet[,-c(34)]), label = data.matrix(trainSet$Class))
xgb.testData  <- xgb.DMatrix(data = data.matrix(testSet[,-c(34)]), label = data.matrix(testSet$Class))
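Since data.matrix() on a factor goes through as.matrix() and yields character values that xgboost has to re-coerce, a quick sanity check that both DMatrix objects really hold 0/1 labels might be worthwhile (getinfo() is xgboost's accessor for data stored in a DMatrix):

# read the stored labels back out of both DMatrix objects
table(getinfo(xgb.trainData, "label"))
table(getinfo(xgb.testData, "label"))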
#####
###Set parameters & create models
#####
#params
param <- data.frame(nrounds = 100, max_depth = 2, eta = 0.3, gamma = 0,
                    colsample_bytree = 0.8, min_child_weight = 1, subsample = 1)
#xgboost
set.seed(1992)
fit.xgb <- xgb.train(
  params = list(eta = param$eta, max_depth = param$max_depth,
                gamma = param$gamma, colsample_bytree = param$colsample_bytree,
                min_child_weight = param$min_child_weight, subsample = param$subsample),
  data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
#caret
set.seed(1992)
fit.xgbTree <- train(Class ~ ., data = trainSet, method = "xgbTree",
                     metric = "Accuracy", trControl = trainControl(method = "none"),
                     tuneGrid = param)
#####
###Print results (predictions)
#####
print("xgboost")
predictionxgb <- as.numeric(predict(fit.xgb, xgb.testData) >= 0.5)
confusionMatrix(as.factor(predictionxgb), testSet$Class)
#Confusion Matrix and Statistics
#
# Reference
#Prediction 0 1
# 0 18 0
# 1 7 45
# ...
print("caret")
predictionsxgbTree <- predict(fit.xgbTree, testSet)
confusionMatrix(predictionsxgbTree, testSet$Class)
#Confusion Matrix and Statistics
#
# Reference
#Prediction 0 1
# 0 17 0
# 1 8 45
# ...
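Comparing the raw predicted probabilities instead of the thresholded classes can also show how large the disagreement actually is; a sketch, assuming the second column of caret's type = "prob" output corresponds to class "1":

prob.xgb <- predict(fit.xgb, xgb.testData)
prob.caret <- predict(fit.xgbTree, testSet, type = "prob")[, 2] # assumed: column 2 = class "1"
summary(prob.xgb - prob.caret) # non-zero values show where the two models diverge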
This information about the two models might also help (I don't see any important difference):
#xgboost:
fit.xgb
##### xgb.Booster
#raw: 35.9 Kb
#call:
# xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
# gamma = param$gamma, colsample_bytree = param$colsample_bytree,
# min_child_weight = param$min_child_weight, subsample = param$subsample),
# data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
#params (as set within xgb.train):
# eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
#xgb.attributes:
# niter
#callbacks:
# cb.print.evaluation(period = print_every_n)
#niter: 100
#caret:
fit.xgbTree$finalModel
##### xgb.Booster
#raw: 36 Kb
#call:
# xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
# gamma = param$gamma, colsample_bytree = param$colsample_bytree,
# min_child_weight = param$min_child_weight, subsample = param$subsample),
# data = x, nrounds = param$nrounds, objective = "binary:logistic")
#params (as set within xgb.train):
# eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
#xgb.attributes:
# niter
#callbacks:
# cb.print.evaluation(period = print_every_n)
#niter: 100
#xNames: V1 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34
#problemType: Classification
#tuneValue:
# nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
#1 100 2 0.3 0 0.8 1 1
#obsLevels: 0 1
#param:
# list()
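One further experiment that might isolate the cause: colsample_bytree = 0.8 makes training stochastic, and multithreaded training can consume random numbers in a non-deterministic order, so forcing a single thread in both calls could be informative. A sketch, assuming caret forwards extra arguments such as nthread through ... to xgb.train():

# re-fit both models single-threaded; any remaining difference
# is then not caused by thread scheduling
set.seed(1992)
fit.xgb.1t <- xgb.train(
  params = list(eta = param$eta, max_depth = param$max_depth,
                gamma = param$gamma, colsample_bytree = param$colsample_bytree,
                min_child_weight = param$min_child_weight, subsample = param$subsample,
                nthread = 1),
  data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
set.seed(1992)
fit.xgbTree.1t <- train(Class ~ ., data = trainSet, method = "xgbTree",
                        metric = "Accuracy", trControl = trainControl(method = "none"),
                        tuneGrid = param, nthread = 1)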