I am trying to use xgboost in R to predict a binary outcome. The positive class "1" makes up only about 3% of all records.
I am using the gbtree booster, but I don't know how to set the parameters so that the model treats "1" as the positive class.
Do you have any insights?
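From the documentation, my understanding is that with objective = "binary:logistic" the label coded as 1 is automatically treated as the positive class, and the usual lever for imbalance is scale_pos_weight = sum(negatives) / sum(positives). This is the small sketch I used to get my value (it assumes sd_app_install is already coded 0/1 in train):

# sketch: compute scale_pos_weight as the ratio of negatives to positives
# (assumes the label column sd_app_install is already coded 0/1)
n_pos <- sum(train$sd_app_install == 1)
n_neg <- sum(train$sd_app_install == 0)
n_neg / n_pos   # roughly (1 - 0.03) / 0.03 ~ 32, which is where my 32.3 comes from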
library(data.table)
library(xgboost)
setDT(train)
setDT(test)
#check missing values
table(is.na(train))
sapply(train, function(x) sum(is.na(x))/length(x))*100
table(is.na(test))
sapply(test, function(x) sum(is.na(x))/length(x))*100
#replace all missing values with -1
train[is.na(train)] <- -1
test[is.na(test)] <- -1
#extract labels
labels <- as.factor(train$sd_app_install)
ts_label <- as.factor(test$sd_app_install)
#one-hot encode the features
new_tr <- model.matrix(~ . + 0, data = train[, -c("sd_app_install"), with = FALSE])
new_ts <- model.matrix(~ . + 0, data = test[, -c("sd_app_install"), with = FALSE])
#convert factor to numeric
labels <- as.numeric(labels)-1
ts_label <- as.numeric(ts_label)-1
#preparing matrix
dtrain <- xgb.DMatrix(data = new_tr,label = labels)
dtest <- xgb.DMatrix(data = new_ts,label=ts_label)
#model parameters
params <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eta = 0.1,
  gamma = 6,
  max_depth = 10,
  min_child_weight = 3.78,
  subsample = 0.69,
  colsample_bytree = 0.933,
  scale_pos_weight = 32.3   # ratio of negatives to positives (~97/3)
)
xgbcv <- xgb.cv(
  params = params,
  data = dtrain,
  nrounds = 300,
  nfold = 5,
  showsd = TRUE,
  stratified = TRUE,
  print_every_n = 10,
  early_stopping_rounds = 20,
  maximize = FALSE
)
##best iteration = 79
#model training
xgb1 <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 300,
  watchlist = list(train = dtrain, val = dtest),   # early stopping monitors the last entry (val)
  print_every_n = 10,
  early_stopping_rounds = 10,
  maximize = TRUE,   # AUC should be maximized
  eval_metric = "auc"
)
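For completeness, this is how I check whether the trained model actually picks up the positives: with binary:logistic, predict() returns the probability of class "1", so I compare a thresholded version against ts_label (the 0.5 cut-off is only an assumption; with 3% positives a lower cut-off may be more appropriate):

# sketch: inspect how many positives the model recovers on the test set
pred_prob  <- predict(xgb1, dtest)          # probability of class "1"
pred_class <- as.integer(pred_prob > 0.5)   # 0.5 threshold is just an assumption
table(predicted = pred_class, actual = ts_label)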
Thanks,
Rania