Different results: "xgboost" vs. "caret" in R

Posted: 2018-02-28 09:10:35

Tags: r machine-learning classification r-caret xgboost

I am trying to set up an xgboost classification model in R

  1. with the xgboost library
  2. with caret

but I get different results (e.g. different predictions for the test dataset), even though I use the same settings (?). I used the Ionosphere dataset from the mlbench library and tried to keep the example as simple as possible (no cross-validation, no parameter tuning, etc.).

Does anyone know why I get different results (see below)? Since caret is just a wrapper around xgboost ("it just calls the same xgboost package"), the results should be exactly the same.
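Since caret exposes its model definitions, one way to verify the wrapper claim is to print the fit function caret runs internally for method = "xgbTree" (a quick inspection sketch; getModelInfo() is part of caret):

    # print the code caret executes for method = "xgbTree"; it should end in a
    # call to xgboost::xgb.train()
    library(caret)
    info <- getModelInfo("xgbTree", regex = FALSE)[[1]]
    info$fit        # the internal fit function
    info$parameters # the tuning parameters expected in tuneGrid

Here is the full example: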

    library(caret)
    library(xgboost)
    library(mlbench)
    
    #####
    ###Load & Prepare Data
    #####
    data(Ionosphere)
    dataset <- Ionosphere
    dataset <- dataset[, -2] # remove V2 (constant everywhere)
    dataset$V1 <- as.numeric(as.character(dataset$V1)) # factor to numeric
    dataset$Class <- ifelse(dataset$Class == "good", 1, 0) # recode good -> 1, bad -> 0
    dataset$Class <- as.factor(dataset$Class) # back to a factor with levels "0" and "1"
    
    #####
    ###Create Train & Test Dataset
    #####
    set.seed(1992)
    validation_index<-createDataPartition(dataset$Class, p=0.8, list=FALSE)
    testSet<-dataset[-validation_index,]
    trainSet<-dataset[validation_index,]
    
    #xgb.DMatrix for xgb.train() (column 34 of trainSet/testSet is Class);
    #the factor label is converted back to a numeric 0/1 vector
    xgb.trainData <- xgb.DMatrix(data = data.matrix(trainSet[, -34]),
                                 label = as.numeric(as.character(trainSet$Class)))
    xgb.testData  <- xgb.DMatrix(data = data.matrix(testSet[, -34]),
                                 label = as.numeric(as.character(testSet$Class)))
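    
    # added sanity check (not in the original question): binary:logistic
    # expects 0/1 labels, so confirm the factor conversion did not shift them
    stopifnot(all(getinfo(xgb.trainData, "label") %in% c(0, 1)))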
    
    #####
    ###Set parameters & create models
    #####
    #params
    param <- data.frame(nrounds = 100, max_depth = 2, eta = 0.3, gamma = 0,
                        colsample_bytree = 0.8, min_child_weight = 1, subsample = 1)
    
    #xgboost
    set.seed(1992)
    fit.xgb <- xgb.train(
            params = list(eta = param$eta, max_depth = param$max_depth, 
                gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
                min_child_weight = param$min_child_weight, subsample = param$subsample),    
            data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
    
    #caret
    set.seed(1992)
    fit.xgbTree <- train(Class~., data=trainSet, method="xgbTree",
                        metric="Accuracy", trControl=trainControl(method="none"),tuneGrid=param)
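    
    # added diagnostic (not in the original question): if the two boosters
    # were identical, their raw probability predictions would match exactly;
    # column 2 of caret's type = "prob" output is the probability of class "1"
    p.xgb   <- predict(fit.xgb, xgb.testData)
    p.caret <- predict(fit.xgbTree, testSet, type = "prob")[, 2]
    all.equal(unname(p.xgb), unname(p.caret))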
    
    #####
    ###Print results (predictions)
    #####
    print("xgboost")
    predictionxgb <- as.numeric(predict(fit.xgb, xgb.testData) >= 0.5)
    confusionMatrix(factor(predictionxgb, levels = levels(testSet$Class)), testSet$Class)
    #Confusion Matrix and Statistics
    #
    #          Reference
    #Prediction  0  1
    #         0 18  0
    #         1  7 45
    # ...
    
    
    print("caret")
    predictionsxgbTree <- predict(fit.xgbTree, testSet)
    confusionMatrix(predictionsxgbTree, testSet$Class)
    #Confusion Matrix and Statistics
    #
    #          Reference
    #Prediction  0  1
    #         0 17  0
    #         1  8 45
    # ...
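
One thing worth checking (this diagnostic is not part of the original question): train(Class ~ ., ...) builds its design matrix through the formula interface and model.matrix(), while xgb.train() receives data.matrix(trainSet[, -34]) directly, so the two models may not see byte-identical inputs. A quick comparison, assuming column 34 is Class as above:

    # compare caret's formula-derived matrix with the one given to xgb.DMatrix()
    mm <- model.matrix(Class ~ ., data = trainSet)[, -1] # drop the intercept
    dm <- data.matrix(trainSet[, -34])
    all.equal(colnames(mm), colnames(dm))
    all.equal(unname(mm), unname(dm))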
    

This information about the two models might also help (I do not see any important difference):

    #xgboost:
    fit.xgb
    ##### xgb.Booster
    #raw: 35.9 Kb 
    #call:
    #  xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    #    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    #    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    #    data = xgb.trainData, nrounds = param$nrounds, objective = "binary:logistic")
    #params (as set within xgb.train):
    #  eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
    #xgb.attributes:
    #  niter
    #callbacks:
    #  cb.print.evaluation(period = print_every_n)
    #niter: 100
    
    
    #caret:
    fit.xgbTree$finalModel
    ##### xgb.Booster
    #raw: 36 Kb 
    #call:
    #  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    #   gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    #   min_child_weight = param$min_child_weight, subsample = param$subsample), 
    #    data = x, nrounds = param$nrounds, objective = "binary:logistic")
    #params (as set within xgb.train):
    #  eta = "0.3", max_depth = "2", gamma = "0", colsample_bytree = "0.8", min_child_weight = "1", subsample = "1", objective = "binary:logistic", silent = "1"
    #xgb.attributes:
    #  niter
    #callbacks:
    #  cb.print.evaluation(period = print_every_n)
    #niter: 100
    #xNames: V1 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34
    #problemType: Classification
    #tuneValue:
    #  nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
    #1     100         2 0.3     0              0.8                1         1
    #obsLevels: 0 1
    #param:
    #        list()
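
Since colsample_bytree = 0.8 makes tree construction stochastic, and the two interfaces may consume the random-number stream differently before xgb.train() is reached, set.seed(1992) alone does not guarantee identical fits. A hedged experiment (not from the original question) is to turn all sampling off, which should make the boosting deterministic, and check whether the predictions then coincide:

    # with colsample_bytree = 1 and subsample = 1 the boosting is deterministic,
    # so any remaining difference cannot be blamed on the RNG
    param.det <- data.frame(nrounds = 100, max_depth = 2, eta = 0.3, gamma = 0,
                            colsample_bytree = 1, min_child_weight = 1, subsample = 1)
    fit.xgb.det <- xgb.train(params = list(eta = param.det$eta, max_depth = param.det$max_depth,
                                           gamma = param.det$gamma, colsample_bytree = param.det$colsample_bytree,
                                           min_child_weight = param.det$min_child_weight, subsample = param.det$subsample),
                             data = xgb.trainData, nrounds = param.det$nrounds, objective = "binary:logistic")
    fit.caret.det <- train(Class ~ ., data = trainSet, method = "xgbTree",
                           trControl = trainControl(method = "none"), tuneGrid = param.det)
    all.equal(unname(predict(fit.xgb.det, xgb.testData)),
              unname(predict(fit.caret.det, testSet, type = "prob")[, 2]))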
    

0 Answers:

No answers yet