Caret 中的 xgbTree 模型:所有预测值都落在训练数据范围之外

时间:2016-11-18 11:57:30

标签: r regression r-caret

我正在尝试在 caret 中使用 xgbTree,仅用 3 个变量训练一个回归模型。该模型只给出 2 个离散的预测值(2.3 和 2.6),而这两个值都落在响应变量的取值范围之外(响应变量只取 2.8 到 4.0 之间的 5 个离散值)。

此外,当我尝试计算变量重要性时,我收到以下警告消息

  

In FUN(newX[, i], ...) : no non-missing arguments to max; returning -Inf(即:max 没有非缺失参数,返回 -Inf)

仅列出1个变量,其重要性为NaN。

有人能够在这里解释我做错了什么吗?我附上了一个简单的示例代码来描述我的意思。

library("caret")
library("xgboost") 
library("plyr")

# Predictor values stored row-wise: every consecutive triple is one
# observation (x1, x2, x3), 45 observations in total.
x_linear <- c(
  3.76, 3.88, 3.55, 3.47, 3.49, 3.44, 3.47, 3.49, 3.44, 3.92, 3.94, 3.55,
  3.61, 3.57, 3.46, 3.72, 3.96, 3.55, 3.34, 3.26, 3.46, 3.69, 3.69, 3.53,
  3.85, 3.78, 3.55, 3.42, 3.36, 3.53, 3.98, 3.91, 3.55, 3.95, 3.82, 3.55,
  3.07, 3.02, 3.44, 3.45, 3.24, 3.46, 3.83, 3.90, 3.55, 3.89, 3.96, 3.55,
  3.84, 3.76, 3.55, 3.78, 3.94, 3.46, 3.28, 3.47, 3.44, 3.66, 3.66, 3.46,
  3.81, 3.90, 3.46, 3.66, 3.64, 3.46, 3.70, 3.69, 3.53, 3.89, 3.85, 3.55,
  3.89, 3.85, 3.55, 3.39, 3.50, 3.46, 3.59, 3.52, 3.55, 3.42, 3.29, 3.44,
  3.28, 3.39, 3.46, 3.23, 3.17, 3.53, 3.57, 3.28, 3.46, 3.61, 3.60, 3.46,
  3.08, 3.02, 3.44, 3.55, 3.63, 3.55, 3.60, 3.63, 3.55, 3.26, 3.27, 3.53,
  3.26, 3.27, 3.53, 3.54, 3.62, 3.46, 3.54, 3.62, 3.55, 3.29, 3.30, 3.44,
  3.60, 3.59, 3.55, 4.00, 3.98, 3.55, 3.25, 3.41, 3.55, 3.59, 3.66, 3.55,
  3.47, 3.51, 3.46
)

# Reshape into a 45 x 3 matrix, filling row by row, and name the columns
# in the same call.
x <- matrix(
  x_linear,
  nrow = 45,
  ncol = 3,
  byrow = TRUE,
  dimnames = list(NULL, c("x1", "x2", "x3"))
)

# Response for the 45 observations, in the same order as the rows of `x`.
# It only takes five discrete levels: 2.8, 3.1, 3.4, 3.7 and 4.0.
y <- c(
  4.0, 3.7, 3.4, 4.0, 4.0, 4.0, 3.1, 3.4, 4.0,
  3.4, 4.0, 4.0, 2.8, 3.4, 4.0, 4.0, 4.0, 4.0,
  3.4, 3.7, 4.0, 3.7, 4.0, 4.0, 4.0, 3.4, 3.7,
  3.4, 3.1, 3.1, 2.8, 3.4, 2.8, 3.7, 3.4, 3.1,
  3.1, 4.0, 3.7, 3.1, 3.7, 4.0, 2.8, 3.7, 3.7
)

# Resampling set-up passed to train(): repeated cross-validation
# (number = 8, repeats = 3), scored with defaultSummary, with the final
# model picked by the "oneSE" selection rule.
# The result is stored under the same name as the caret function; a later
# call `trainControl(...)` still resolves to the function because R skips
# non-function objects when looking up a call.
trainControl <- trainControl(
  method = "repeatedcv",
  number = 8,
  repeats = 3,
  summaryFunction = defaultSummary,
  selectionFunction = "oneSE"
)

# Single-point tuning grid for method = "xgbTree" (no search is performed).
# NOTE(review): these values look like the cause of the symptoms described
# in the question - TODO confirm by re-running:
#   * eta = 0.01 with only 100 rounds presumably leaves the boosted sum far
#     from convergence, so predictions stay between xgboost's starting
#     score and the mean of y (i.e. below min(y));
#   * min_child_weight = 10 on a few dozen rows presumably allows almost no
#     splits, which would explain the two distinct predicted values and
#     the single variable reported by varImp().
# NOTE(review): recent caret releases appear to also require a `subsample`
# column in an xgbTree grid - verify against the installed version.
tuneGrid <- expand.grid(
  .nrounds = 100,
  .max_depth = 6,
  .eta = 0.01,
  .gamma = 0,
  .colsample_bytree = 1,
  .min_child_weight = 10
)

# Split into training and test sets.
# The seed is fixed so the partition is reproducible.
set.seed(1)
nSamples <- length(y)

# createDataPartition() is asked for p = 0.8 of the rows; the first element
# of its result holds the selected row indexes.
TrainingIndexes <- createDataPartition(y, p = 0.8)[[1]]

# Every row index that was not selected for training goes to the test set.
TestIndexes <- seq_len(nSamples)[-TrainingIndexes]

# Preprocess data.
# The centering/scaling parameters are estimated from the training rows
# only and then applied to the full matrix, so test rows are transformed
# with training statistics.
procValues <- preProcess(
  x[TrainingIndexes, ],
  method = c("center", "scale")
)
ProcedData <- predict(procValues, newdata = x)

# Training subset of the transformed predictors, used to fit the model.
ProcedTrainingData <- ProcedData[TrainingIndexes, ]

# Fit the xgbTree model on the training rows; the seed is reset so the
# resampling inside train() is reproducible.
set.seed(1)
fit.xgb <- train(
  x = ProcedTrainingData,
  y = y[TrainingIndexes],
  method = "xgbTree",
  metric = "RMSE",
  tuneGrid = tuneGrid,
  trControl = trainControl
)

# Variable importance of the fitted model (this is the call that raises the
# warning quoted in the question).
Imps <- varImp(fit.xgb)

# Predict for every observation, then plot measured vs. predicted values:
# training rows in blue, test rows in red.
Pred_y <- predict(fit.xgb, ProcedData)
plot(
  y[TrainingIndexes],
  Pred_y[TrainingIndexes],
  col = "blue",
  xlab = "Meas",
  ylab = "Pred"
)
points(y[TestIndexes], Pred_y[TestIndexes], col = "red")

0 个答案:

没有答案