Predicting the whole data set with XGBoost

Time: 2017-08-29 09:31:57

Tags: r pca xgboost

I have a data set of predictor variables with 35446 rows and 38 columns, plus a target.

My workflow: balance the classes, reduce the dimensionality with PCA, make train and test sets, and fit an XGBoost model.


Since the classes are imbalanced, I first build a balanced data set by sampling the target == 0 rows down to the number of target == 1 rows:

library(dplyr)       # %>%, filter, select
library(data.table)  # setDT
library(xgboost)     # xgb.DMatrix, xgb.cv, xgb.train
library(caret)       # confusionMatrix

data_for_predict <- res
data_good <- data_for_predict %>% filter(target == 1)
data_bad  <- data_for_predict %>% filter(target == 0)

# undersample the majority class (target == 0) down to the size of the minority class
set.seed(789)
size_bad <- floor(1 * nrow(data_good))
data_ind <- sample(seq_len(nrow(data_bad)), size = size_bad)
data_bad <- data_bad[data_ind, ]

# recombine, shuffle, and keep the target in a separate data frame
data_for_predict <- rbind(data_good, data_bad)
data_for_predict <- data_for_predict[sample(1:nrow(data_for_predict)), ]
goal <- as.data.frame(data_for_predict$target)
data_for_predict <- data_for_predict %>% select(-target)
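
As a quick sanity check (a small sketch using only the objects created above), both classes should now be equally represented:

# the two classes should now have the same number of rows
nrow(data_good); nrow(data_bad)   # should be equal
table(goal[[1]])                  # counts of 0 and 1 in the kept target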

After that I want to reduce the dimensionality of the data with PCA:

# principal components of the predictors
PCA <- prcomp(data_for_predict, scale. = TRUE)
PCA <- as.data.frame(PCA$x)

# append the component scores to the predictors and restore the target
data_for_predict <- cbind(data_for_predict, PCA)
data_for_predict <- as.data.frame(data_for_predict)
data_for_predict$target <- goal$`data_for_predict$target`
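
Note that the block above appends every principal component next to the original predictors, so the number of columns actually grows. If the aim is really to reduce dimensionality, a common way to choose how many components to keep is the cumulative explained variance. A minimal sketch, assuming the prcomp result is kept in its own object (pca_fit is a name introduced here, computed on the predictor matrix before the scores are appended):

# proportion of variance explained by each component and its running total
pca_fit  <- prcomp(data_for_predict, scale. = TRUE)
expl_var <- pca_fit$sdev^2 / sum(pca_fit$sdev^2)
cum_var  <- cumsum(expl_var)

# keep the smallest number of components covering, e.g., 95% of the variance
n_comp  <- which(cum_var >= 0.95)[1]
reduced <- as.data.frame(pca_fit$x[, 1:n_comp, drop = FALSE])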

Then I split the data into training and test samples:

smp_size <- floor(0.8 * nrow(data_for_predict))
set.seed(123)
train_ind <- sample(seq_len(nrow(data_for_predict)), size = smp_size)

# 80/20 split with clean row names and syntactic column names
train <- data_for_predict[train_ind, ]
rownames(train) <- seq_len(nrow(train))
test <- data_for_predict[-train_ind, ]
rownames(test) <- seq_len(nrow(test))
names(test) <- make.names(names(test))
names(train) <- make.names(names(train))
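
As a side note, the purely random split above does not guarantee exactly the same class ratio in both parts. A stratified alternative with caret's createDataPartition would look roughly like this (a sketch, not the code used in the rest of the question):

# stratified 80/20 split on the target
set.seed(123)
train_ind <- createDataPartition(factor(data_for_predict$target), p = 0.8, list = FALSE)
train <- data_for_predict[train_ind, ]
test  <- data_for_predict[-train_ind, ]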

Now I prepare the data for training:

setDT(train)
setDT(test)
labels <- train$target
ts_label <- test$target

# design matrices without an intercept column, then the xgboost data structures
new_tr <- model.matrix(~ . + 0, data = train[, -c("target"), with = FALSE])
new_ts <- model.matrix(~ . + 0, data = test[, -c("target"), with = FALSE])
dtrain <- xgb.DMatrix(data = new_tr, label = labels)
dtest  <- xgb.DMatrix(data = new_ts, label = ts_label)

Then I fit the model, and I got a good result:

params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.3, gamma = 0,
               max_depth = 10, min_child_weight = 1, subsample = 1, colsample_bytree = 1)

# cross-validate to pick the number of boosting rounds
xgbcv <- xgb.cv(params = params, data = dtrain, nrounds = 1000, nfold = 5, showsd = TRUE,
                stratified = TRUE, print_every_n = 10, early_stopping_rounds = 20,
                maximize = FALSE, eval_metric = "error")

xgb1 <- xgb.train(params = params, data = dtrain, nrounds = 46,
                  watchlist = list(val = dtest, train = dtrain), print_every_n = 10,
                  maximize = FALSE, eval_metric = "error")

# predicted probabilities, turned into class labels with a 0.77 cut-off
xgbpred <- predict(xgb1, dtest)   # predict() returns probabilities for binary:logistic
xgbpred <- ifelse(xgbpred > 0.77, 1, 0)

confusionMatrix(factor(xgbpred), factor(ts_label))  # confusionMatrix() expects factors
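
The nrounds = 46 above presumably comes from the cross-validation run; with early stopping enabled, the chosen iteration can also be taken straight from the CV object (a sketch, assuming a reasonably recent xgboost where xgb.cv exposes best_iteration):

# number of boosting rounds selected by early stopping during CV
best_n <- xgbcv$best_iteration
xgb1 <- xgb.train(params = params, data = dtrain, nrounds = best_n,
                  watchlist = list(val = dtest, train = dtrain),
                  print_every_n = 10, maximize = FALSE, eval_metric = "error")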

But if I want to predict the whole data set (35446 rows and 38 columns) with the model obtained above, I get:

Confusion Matrix and Statistics

          Reference
Prediction    0    1
         0 1569   90
         1  102 1583

               Accuracy : 0.9426             
                 95% CI : (0.9342, 0.9502)   
    No Information Rate : 0.5003             
    P-Value [Acc > NIR] : <0.0000000000000002

                  Kappa : 0.8852             
 Mcnemar's Test P-Value : 0.4273             

            Sensitivity : 0.9390             
            Specificity : 0.9462             
         Pos Pred Value : 0.9458             
         Neg Pred Value : 0.9395             
             Prevalence : 0.4997             
         Detection Rate : 0.4692             
   Detection Prevalence : 0.4961             
      Balanced Accuracy : 0.9426             

       'Positive' Class : 0    

Why does the number of errors decrease if the model was built on the same data?
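
For reference, this is roughly how the whole data set is scored (a sketch: data_all and all_label are placeholder names for the full 35446-row data after exactly the same preparation as train/test, i.e. the same PCA columns and make.names() treatment, and its target):

# data_all: the full, already-prepared data set, including its target column
setDT(data_all)
all_label <- data_all$target
new_all   <- model.matrix(~ . + 0, data = data_all[, -c("target"), with = FALSE])
dall      <- xgb.DMatrix(data = new_all, label = all_label)

# same 0.77 cut-off as on the test set
pred_all <- predict(xgb1, dall)
pred_all <- ifelse(pred_all > 0.77, 1, 0)
confusionMatrix(factor(pred_all), factor(all_label))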

0 Answers:

No answers