我有一个带预测变量的数据集,它有35446行和38列,另外还有一个目标变量(target)。
我创建训练集和测试集:
I would use struct as I need to call the static func:
// MARK: - Signature base-string construction

struct BaseString {

    /// Builds a signature "base string" by merging the query parameters of
    /// `urlPath` with the auth-related parameters, then joining them as
    /// `key=value` pairs.
    ///
    /// - Parameters:
    ///   - authPrefix: Prefix for the generated auth keys (e.g. "apex_l1").
    ///   - signMethod: Stored under "<authPrefix>_signature_method".
    ///   - urlPath: Full URL whose query string is parsed into key/value pairs.
    ///   - nonce: Stored under "<authPrefix>_nonce".
    ///   - timestamp: Stored under "<authPrefix>_timestamp".
    ///   - delimiter: Separator between "key=value" pairs (default "&").
    ///   - sort: Sort pairs lexicographically by key when true (default true).
    ///   - quote: When true, each value is wrapped in double quotes and the
    ///     result is prefixed with "&<url-without-query>&" (default false).
    /// - Returns: The assembled base string, or `nil` when `urlPath` cannot be
    ///   parsed as a URL.
    static func createBaseString(authPrefix: String,
                                 signMethod: String,
                                 urlPath: String,
                                 nonce: String,
                                 timestamp: String,
                                 delimiter: String = "&",
                                 sort: Bool = true,
                                 quote: Bool = false) -> String? {
        guard var components = URLComponents(string: urlPath) else { return nil }

        // (1) Collect the URL's own query parameters into a dictionary.
        var dict = [String: String]()
        for item in components.queryItems ?? [] {
            dict[item.name] = item.value ?? ""
        }

        // Add the auth parameters alongside the query parameters.
        dict[authPrefix + "_timestamp"] = timestamp
        dict[authPrefix + "_nonce"] = nonce
        dict[authPrefix + "_signature_method"] = signMethod
        dict[authPrefix + "_version"] = "1.0"

        // Dictionaries are unordered, so materialize the pairs before sorting.
        var pairs = Array(dict)
        if sort {
            pairs.sort { $0.key < $1.key }
        }

        if quote {
            // Quoted form: &<base-url>&k1="v1"&k2="v2"...
            components.query = nil   // strip the query for the leading URL part
            let baseURL = components.string ?? ""
            let joined = pairs
                .map { "\($0.key)=\"\($0.value)\"" }
                .joined(separator: delimiter)
            return "&" + baseURL + "&" + joined
        } else {
            // Plain form: k1=v1&k2=v2...
            return pairs
                .map { "\($0.key)=\($0.value)" }
                .joined(separator: delimiter)
        }
    }
}
之后,我想用PCA减少数据的维数。
# Start from the prepared feature table `res` (defined earlier, outside this chunk).
data_for_predict <- res
# Split rows by class label so the two classes can be balanced.
data_good <- data_for_predict%>%filter(target == 1)
data_bad <- data_for_predict%>%filter(target == 0)
set.seed(789)  # reproducible downsampling
# Downsample the majority class to the size of the minority class
# (the factor 1 means "exactly as many bad rows as good rows").
size_bad <- floor(1 * nrow(data_good))
data_ind <- sample(seq_len(nrow(data_bad)), size = size_bad)
data_bad <- data_bad[data_ind, ]
# Recombine the balanced classes and shuffle the row order.
data_for_predict <- rbind(data_good, data_bad)
data_for_predict <- data_for_predict[sample(1:nrow(data_for_predict)),]
# Keep the label aside before dropping it from the predictors.
# NOTE: the resulting column is literally named "data_for_predict$target",
# which is why it is later accessed with backticks.
goal <- as.data.frame(data_for_predict$target)
data_for_predict <- data_for_predict%>%select(-target)
然后我将数据分解为训练和测试样本
# Compute principal components on the predictors (centered/scaled to unit variance).
PCA <- prcomp(data_for_predict, scale. = TRUE)
PCA <- as.data.frame(PCA$x)
# NOTE(review): cbind keeps ALL original predictors AND all principal
# components, so this step doubles the column count instead of reducing
# dimensionality. To actually reduce dimensions, keep only the first k
# components, e.g. data_for_predict <- PCA[, 1:k].
data_for_predict <- cbind(data_for_predict, PCA)
data_for_predict <- as.data.frame(data_for_predict)
# BUG FIX: the label was stored in `goal` (see the balancing step); the
# original line referenced an undefined object `target`. `goal` has a single
# column whose name is the literal string "data_for_predict$target".
data_for_predict$target <- goal$`data_for_predict$target`
现在我准备训练数据:
# Hold out 20% of the rows for testing; the remaining 80% form the training set.
smp_size <- floor(0.8 * nrow(data_for_predict))
set.seed(123)  # reproducible split
train_ind <- sample(seq_len(nrow(data_for_predict)), size = smp_size)

# Materialize the two partitions and reset their row names to 1..n.
train <- data_for_predict[train_ind, ]
rownames(train) <- seq_len(nrow(train))
test <- data_for_predict[-train_ind, ]
rownames(test) <- seq_len(nrow(test))

# Make column names syntactically valid (required later by model.matrix).
names(test) <- make.names(names(test))
names(train) <- make.names(names(train))
拟合模型:
# Convert the data frames to data.table (needed for the `with = F`
# column-dropping syntax below).
setDT(train)
setDT(test)
# Extract the 0/1 labels for xgboost.
labels <- train$target
ts_label <- test$target
# One-hot encode the predictors; `~.+0` suppresses the intercept column so
# every factor level gets its own indicator.
new_tr <- model.matrix(~.+0,data = train[,-c("target"),with=F])
new_ts <- model.matrix(~.+0,data = test[,-c("target"),with=F])
# Wrap features + labels in xgboost's native matrix format.
dtrain <- xgb.DMatrix(data = new_tr,label = labels)
dtest <- xgb.DMatrix(data = new_ts,label=ts_label)
我得到了一个好结果:
# Hyper-parameters for a binary-classification gradient-boosted tree model.
params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.3, gamma = 0,
               max_depth = 10, min_child_weight = 1, subsample = 1, colsample_bytree = 1)
# 5-fold stratified CV to choose the number of boosting rounds.
# BUG FIX: the xgboost argument is `early_stopping_rounds` (plural); the
# original `early_stopping_round` was swallowed by `...` and ignored, so CV
# always ran the full 1000 rounds.
xgbcv <- xgb.cv(params = params, data = dtrain, nrounds = 1000, nfold = 5, showsd = T,
                stratified = T, print_every_n = 10,
                early_stopping_rounds = 20, maximize = F, eval_metric = "error")
# Final model on the full training set.
# NOTE(review): prefer nrounds = xgbcv$best_iteration over the hard-coded 46.
xgb1 <- xgb.train(params = params, data = dtrain, nrounds = 46,
                  watchlist = list(val = dtest, train = dtrain), print_every_n = 10,
                  maximize = F, eval_metric = "error")
# predict() on an xgb.Booster with objective binary:logistic already returns
# probabilities; the glm-style `type = "response"` argument is not used by
# xgboost and has been dropped.
xgbpred <- predict(xgb1, dtest)
xgbpred <- ifelse(xgbpred > 0.77, 1, 0)
# FIX: caret::confusionMatrix requires factors with identical levels;
# passing numeric vectors errors in current caret versions.
confusionMatrix(factor(xgbpred, levels = c(0, 1)), factor(ts_label, levels = c(0, 1)))
但是如果我想根据获得的模型预测整个数据集(35446行和38列),我得到了:
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 1569 90
1 102 1583
Accuracy : 0.9426
95% CI : (0.9342, 0.9502)
No Information Rate : 0.5003
P-Value [Acc > NIR] : <0.0000000000000002
Kappa : 0.8852
Mcnemar's Test P-Value : 0.4273
Sensitivity : 0.9390
Specificity : 0.9462
Pos Pred Value : 0.9458
Neg Pred Value : 0.9395
Prevalence : 0.4997
Detection Rate : 0.4692
Detection Prevalence : 0.4961
Balanced Accuracy : 0.9426
'Positive' Class : 0
如果模型建立在相同的数据上,为什么会减少错误?