Question

我为连续结果变量构建了一个随机森林。

使用 randomForest 函数的默认设置，我得到的训练集 MSE 为 0.014，测试集 MSE 为 0.079。我认为这是过拟合问题。因此，我尝试通过网格搜索、以最低的袋外（OOB）误差为标准来调整参数 nodesize、mtry 和 ntree。我有 14 个解释变量。网格参数如下所示：

# Candidate values for the three tuning parameters; expand.grid() builds
# one row per combination of them.
hyper_grid_rf <- expand.grid(
  nodesize = seq(from = 10, to = 20, by = 2),
  mtry = seq(from = 2, to = 6, by = 1),
  ntree = seq(from = 200, to = 600, by = 100)
)

之后，我得到的训练集 MSE 为 0.039，测试集 MSE 为 0.077。在我看来，这仍然是过拟合。

这是我的完整代码：

# Grid-search tuning of the random forest ----
# Fix the RNG state
set.seed(1)
# Candidate hyperparameter values; expand.grid() yields one row per
# combination of nodesize, mtry and ntree.
hyper_grid_rf <- expand.grid(
  nodesize = seq(from = 10, to = 20, by = 2),
  mtry = seq(from = 2, to = 6, by = 1),
  ntree = seq(from = 200, to = 600, by = 100)
)




# 
# Out-of-bag (OOB) error, one entry per row of hyper_grid_rf.
# Preallocated with NA instead of being grown inside the loop; which.min()
# skips NA, so an interrupted run cannot select an unfitted row.
oob_err <- rep(NA_real_, nrow(hyper_grid_rf))

# Training rows used for every fit; loop-invariant, so subset only once.
train_part <- subset(trainSet2015, Partizipation == 1)

# Fit one forest per grid row and record its OOB MSE.
# seq_len() (unlike 1:nrow()) yields no iterations for an empty grid.
for (i in seq_len(nrow(hyper_grid_rf))) {
  # Hyperparameters (nodesize, mtry, ntree) stored in row i
  nodesize <- hyper_grid_rf$nodesize[i]
  mtry <- hyper_grid_rf$mtry[i]
  ntree <- hyper_grid_rf$ntree[i]
  print(i)  # progress indicator

  # Reseed before each fit so every candidate starts from the same RNG state
  set.seed(1)

  model <- randomForest(
    LogStundenlohn ~ Geschlecht + Bundesland +
      ArbeitsmarkterfahrungVollzeit +
      I((ArbeitsmarkterfahrungTeilzeit^2) / 100) +
      ArbeitsmarkterfahrungTeilzeit +
      I((ArbeitsmarkterfahrungVollzeit^2) / 100) +
      DauerUnternehmenszugehörigkeit +
      I((DauerUnternehmenszugehörigkeit^2) / 100) +
      inverseMillsRatioPrunedCtreeInfoUs + öffenticherDienst +
      Branche + Unternehmengroesse + Migrationshintergrund + Bildungsgrad,
    data = train_part,
    nodesize = nodesize, mtry = mtry, ntree = ntree
  )
  # model$mse is indexed by number of trees; the entry at ntree is the
  # OOB MSE of the complete forest.
  oob_err[i] <- model$mse[ntree]
}
# 

# Select the grid row whose forest had the smallest OOB error
opt_i <- which.min(oob_err)
bestParams_rf <- hyper_grid_rf[opt_i, ]
print(bestParams_rf)
# Refit the forest with the selected hyperparameters.
# Fail early if the search did not select exactly one row; otherwise the
# fit below would receive empty parameter values.
stopifnot(nrow(bestParams_rf) == 1L)

# Same seed as used for each candidate fit in the grid search
set.seed(1)

# Hyperparameters are read by column name rather than by position, so the
# fit stays correct if the columns of the grid are ever reordered.
rf_grid <- randomForest(
  LogStundenlohn ~ Geschlecht + Bundesland +
    ArbeitsmarkterfahrungVollzeit +
    I((ArbeitsmarkterfahrungTeilzeit^2) / 100) +
    ArbeitsmarkterfahrungTeilzeit +
    I((ArbeitsmarkterfahrungVollzeit^2) / 100) +
    DauerUnternehmenszugehörigkeit +
    I((DauerUnternehmenszugehörigkeit^2) / 100) +
    inverseMillsRatioPrunedCtreeInfoUs + öffenticherDienst +
    Branche + Unternehmengroesse + Migrationshintergrund + Bildungsgrad,
  data = subset(trainSet2015, Partizipation == 1),
  nodesize = bestParams_rf[["nodesize"]],
  mtry = bestParams_rf[["mtry"]],
  ntree = bestParams_rf[["ntree"]],
  importance = TRUE
)
# Evaluation rows, subset once instead of once per use
train_part <- subset(trainSet2015, Partizipation == 1)
test_part <- subset(testSet2015, Partizipation == 1)

# Predictions of the tuned forest on the training and the test rows
pred_t_ranFor_grid <- predict(rf_grid, newdata = train_part)
pred_v_ranFor_grid <- predict(rf_grid, newdata = test_part)

# MSE, rounded to three decimals
rf_mse_tr <- round(MSE(pred_t_ranFor_grid, train_part$LogStundenlohn), 3)
rf_mse_test <- round(MSE(pred_v_ranFor_grid, test_part$LogStundenlohn), 3)

# NOTE: rf_mse_tr is an in-sample error. Passing the training rows as
# newdata lets every tree predict observations it was grown on, so this
# value is optimistic and a gap to rf_mse_test is expected. The OOB MSE
# of the full forest is the training-side figure comparable to test error.
rf_mse_oob <- round(rf_grid$mse[rf_grid$ntree], 3)

通过网格搜索进行调整后，为什么随机森林仍然过拟合？

0 个答案: