Question

将ROC作为metric参数值传递给caretSBF函数

我们的目标是：在使用“通过过滤进行选择”（Selection By Filtering）的 sbf() 函数进行特征选择时，使用 ROC 汇总指标进行模型选择。

我们使用 mlbench 包中的 BreastCancer 数据集作为可重现的示例，分别以 metric = "Accuracy" 和 metric = "ROC" 运行 train() 和 sbf()。

我们希望确认 sbf() 是否像 train() 和 rfe() 函数那样使用 metric 参数来优化模型。为此，我们计划将 train() 函数与 sbf() 函数一起使用：caretSBF$fit 函数会调用 train()，而 caretSBF 被传递给 sbfControl。

从输出来看，metric 参数似乎只用于内部重采样（inner resampling），而没有用于 sbf 部分；也就是说，对于输出中的外部重采样（outer resampling），并没有像 train() 和 rfe() 那样应用 metric 参数。

由于我们使用的 caretSBF 在内部调用 train()，metric 参数的作用范围似乎仅限于 train()，因此不会传递给 sbf。

我们希望澄清 sbf() 是否使用 metric 参数来优化模型，即是否用于外部重采样（outer resampling）？

以下是我们在可重现示例上所做的工作：结果显示 train() 会使用 metric 参数（分别取 Accuracy 和 ROC），但对于 sbf 我们并不确定。

I. 数据部分

Accuracy

II. 自定义汇总函数

定义 fiveStats 汇总函数

ROC

III. train 部分

定义trControl

sbf

train + metric = "Accuracy"

  ## Attach the packages this example relies on
  library(mlbench)
  library(caret)

  ## Bring the `BreastCancer` data set (shipped with *mlbench*) into scope
  data("BreastCancer")

  ## Handle missing values
  # Keep only the observations that have no NA in any column
  BrC1 <- BreastCancer[stats::complete.cases(BreastCancer), ]

  # Keep columns 2 through 10 as the predictor set
  # (per the original note, this drops the Id and Class columns)
  Num_Pred <- BrC1[, seq(2, 10)]

train + metric = "ROC"

  # Summary function that concatenates the output of caret's
  # twoClassSummary() and defaultSummary() into a single vector.
  # All arguments are forwarded unchanged to both summary functions.
  fiveStats <- function(...) {
    two_class_part <- twoClassSummary(...)
    default_part <- defaultSummary(...)
    c(two_class_part, default_part)
  }

IV。编辑caretSBF

编辑 caretSBF 的 summary 函数

  # Resampling settings handed to train(): 10-fold CV repeated once, with
  # class probabilities enabled and fiveStats as the summary function.
  trCtrl <- trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 1,
    classProbs = TRUE,
    summaryFunction = fiveStats
  )

V. SBF 部分

定义sbfControl

   # Fit a random forest over mtry = 2, 3, 4, 5 with metric = "Accuracy";
   # the seed is fixed so the run can be repeated.
   set.seed(1)
   TR_acc <- train(
     x = Num_Pred,
     y = BrC1$Class,
     method = "rf",
     metric = "Accuracy",
     trControl = trCtrl,
     tuneGrid = expand.grid(.mtry = c(2, 3, 4, 5))
   )

   TR_acc
   # Random Forest 
   # 
   # 683 samples
   #   9 predictor
   #   2 classes: 'benign', 'malignant' 
   # 
   # No pre-processing
   # Resampling: Cross-Validated (10 fold, repeated 1 times) 
   # Summary of sample sizes: 615, 615, 614, 614, 614, 615, ... 
   # Resampling results across tuning parameters:
   # 
   #   mtry  ROC        Sens       Spec       Accuracy   Kappa    
   #   2     0.9936532  0.9729798  0.9833333  0.9765772  0.9490311
   #   3     0.9936544  0.9729293  0.9791667  0.9750853  0.9457534
   #   4     0.9929957  0.9684343  0.9750000  0.9706948  0.9361373
   #   5     0.9922907  0.9684343  0.9666667  0.9677536  0.9295782
   # 
   # Accuracy was used to select the optimal model using  the largest value.
   # The final value used for the model was mtry = 2.

SBF + METRIC = "Accuracy"

   # Same fit as the Accuracy run, but with metric = "ROC";
   # the seed is fixed so the run can be repeated.
   set.seed(1)
   TR_roc <- train(
     x = Num_Pred,
     y = BrC1$Class,
     method = "rf",
     metric = "ROC",
     trControl = trCtrl,
     tuneGrid = expand.grid(.mtry = c(2, 3, 4, 5))
   )
   TR_roc
   # Random Forest 
   # 
   # 683 samples
   #   9 predictor
   #   2 classes: 'benign', 'malignant' 
   # 
   # No pre-processing
   # Resampling: Cross-Validated (10 fold, repeated 1 times) 
   # Summary of sample sizes: 615, 615, 614, 614, 614, 615, ... 
   # Resampling results across tuning parameters:
   # 
   #   mtry  ROC        Sens       Spec       Accuracy   Kappa    
   #   2     0.9936532  0.9729798  0.9833333  0.9765772  0.9490311
   #   3     0.9936544  0.9729293  0.9791667  0.9750853  0.9457534
   #   4     0.9929957  0.9684343  0.9750000  0.9706948  0.9361373
   #   5     0.9922907  0.9684343  0.9666667  0.9677536  0.9295782
   # 
   # ROC was used to select the optimal model using  the largest value.
   # The final value used for the model was mtry = 3.

SBF + METRIC = "ROC"

   # Replace the `summary` element of caretSBF with fiveStats
   caretSBF[["summary"]] <- fiveStats

# Control object for sbf(): uses the caretSBF helper functions with 10-fold CV
# repeated once. TRUE is spelled out because T is an ordinary, reassignable
# variable.
sbfCtrl <- sbfControl(
  functions = caretSBF,
  method = "repeatedcv",
  number = 10,
  repeats = 1,
  verbose = TRUE,
  saveDetails = TRUE
)

## ---- sbf() with metric = "Accuracy" ----
set.seed(1)
sbf_acc <- sbf(
  Num_Pred, BrC1$Class,
  sbfControl = sbfCtrl,
  trControl = trCtrl,
  method = "rf",
  metric = "Accuracy"
)

## sbf_acc
sbf_acc
# Selection By Filter
#
# Outer resampling method: Cross-Validated (10 fold, repeated 1 times)
#
# Resampling performance:
#
#     ROC  Sens   Spec Accuracy Kappa    ROCSD SensSD  SpecSD AccuracySD KappaSD
#  0.9931 0.973 0.9833   0.9766 0.949 0.006272 0.0231 0.02913    0.01226 0.02646
#
# Using the training set, 9 variables were selected:
#    Cl.thickness, Cell.size, Cell.shape, Marg.adhesion, Epith.c.size...
#
# During resampling, the top 5 selected variables (out of a possible 9):
#    Bare.nuclei (100%), Bl.cromatin (100%), Cell.shape (100%), Cell.size (100%), Cl.thickness (100%)
#
# On average, 9 variables were selected (min = 9, max = 9)

## Class of sbf_acc
class(sbf_acc)
# [1] "sbf"

## Names of elements of sbf_acc
names(sbf_acc)
#  [1] "pred"         "variables"    "results"      "fit"          "optVariables"
#  [6] "call"         "control"      "resample"     "metrics"      "times"
# [11] "resampledCM"  "obsLevels"    "dots"

## sbf_acc fit element
sbf_acc$fit
# Random Forest
#
# 683 samples
#   9 predictor
#   2 classes: 'benign', 'malignant'
#
# No pre-processing
# Resampling: Cross-Validated (10 fold, repeated 1 times)
# Summary of sample sizes: 615, 614, 614, 615, 615, 615, ...
# Resampling results across tuning parameters:
#
#   mtry  ROC        Sens       Spec       Accuracy   Kappa
#   2     0.9933176  0.9706566  0.9833333  0.9751492  0.9460717
#   5     0.9920034  0.9662121  0.9791667  0.9707801  0.9363708
#   9     0.9914825  0.9684343  0.9708333  0.9693308  0.9327662
#
# Accuracy was used to select the optimal model using the largest value.
# The final value used for the model was mtry = 2.

## Elements of sbf_acc fit
names(sbf_acc$fit)
#  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"
#  [6] "bestTune"     "call"         "dots"         "metric"       "control"
# [11] "finalModel"   "preProcess"   "trainingData" "resample"     "resampledCM"
# [16] "perfNames"    "maximize"     "yLimits"      "times"        "levels"

## sbf_acc fit final Model
sbf_acc$fit$finalModel
# Call:
#  randomForest(x = x, y = y, mtry = param$mtry)
#                Type of random forest: classification
#                      Number of trees: 500
# No. of variables tried at each split: 2
#
#         OOB estimate of error rate: 2.34%
# Confusion matrix:
#           benign malignant class.error
# benign       431        13  0.02927928
# malignant      3       236  0.01255230

## sbf_acc metric
sbf_acc$fit$metric
# [1] "Accuracy"

## sbf_acc fit best Tune
sbf_acc$fit$bestTune
#   mtry
# 1    2

## ---- sbf() with metric = "ROC" ----
set.seed(1)
sbf_roc <- sbf(
  Num_Pred, BrC1$Class,
  sbfControl = sbfCtrl,
  trControl = trCtrl,
  method = "rf",
  metric = "ROC"
)

## sbf_roc
sbf_roc
# Selection By Filter
#
# Outer resampling method: Cross-Validated (10 fold, repeated 1 times)
#
# Resampling performance:
#
#     ROC  Sens   Spec Accuracy Kappa    ROCSD SensSD  SpecSD AccuracySD KappaSD
#  0.9931 0.973 0.9833   0.9766 0.949 0.006272 0.0231 0.02913    0.01226 0.02646
#
# Using the training set, 9 variables were selected:
#    Cl.thickness, Cell.size, Cell.shape, Marg.adhesion, Epith.c.size...
#
# During resampling, the top 5 selected variables (out of a possible 9):
#    Bare.nuclei (100%), Bl.cromatin (100%), Cell.shape (100%), Cell.size (100%), Cl.thickness (100%)
#
# On average, 9 variables were selected (min = 9, max = 9)

## Class of sbf_roc
class(sbf_roc)
# [1] "sbf"

## Names of elements of sbf_roc
names(sbf_roc)
#  [1] "pred"         "variables"    "results"      "fit"          "optVariables"
#  [6] "call"         "control"      "resample"     "metrics"      "times"
# [11] "resampledCM"  "obsLevels"    "dots"

## sbf_roc fit element
sbf_roc$fit
# Random Forest
#
# 683 samples
#   9 predictor
#   2 classes: 'benign', 'malignant'
#
# No pre-processing
# Resampling: Cross-Validated (10 fold, repeated 1 times)
# Summary of sample sizes: 615, 614, 614, 615, 615, 615, ...
# Resampling results across tuning parameters:
#
#   mtry  ROC        Sens       Spec       Accuracy   Kappa
#   2     0.9933176  0.9706566  0.9833333  0.9751492  0.9460717
#   5     0.9920034  0.9662121  0.9791667  0.9707801  0.9363708
#   9     0.9914825  0.9684343  0.9708333  0.9693308  0.9327662
#
# ROC was used to select the optimal model using the largest value.
# The final value used for the model was mtry = 2.

## Elements of sbf_roc fit
names(sbf_roc$fit)
#  [1] "method"       "modelInfo"    "modelType"    "results"      "pred"
#  [6] "bestTune"     "call"         "dots"         "metric"       "control"
# [11] "finalModel"   "preProcess"   "trainingData" "resample"     "resampledCM"
# [16] "perfNames"    "maximize"     "yLimits"      "times"        "levels"

## sbf_roc fit final Model
sbf_roc$fit$finalModel
# Call:
#  randomForest(x = x, y = y, mtry = param$mtry)
#                Type of random forest: classification
#                      Number of trees: 500
# No. of variables tried at each split: 2
#
#         OOB estimate of error rate: 2.34%
# Confusion matrix:
#           benign malignant class.error
# benign       431        13  0.02927928
# malignant      3       236  0.01255230

## sbf_roc metric
sbf_roc$fit$metric
# [1] "ROC"

## sbf_roc fit best Tune
sbf_roc$fit$bestTune
#   mtry
# 1    2

# Question: does sbf() use the `metric` argument to optimize the model?
# If yes, what metric does sbf() use as its default? And if sbf() does use
# the `metric` argument, how can it be set to ROC?

感谢。

Answer 1

sbf 不使用该指标来优化任何内容（与 rfe 不同）；sbf 所做的只是在调用模型之前执行一个特征选择步骤。当然，过滤器由您自己定义，但无法通过 sbf 对过滤器进行调优，因此该步骤不需要任何指标来指导。

使用 sbf(x, y, metric = "ROC") 会把 metric = "ROC" 传递给您所使用的建模函数（在使用 caretSBF 时，它被设计为与 train 配合使用）。之所以如此，是因为 sbf 本身没有 metric 参数：

> names(formals(caret:::sbf.default))
[1] "x"          "y"          "sbfControl" "..."

sbf（）是否使用metric参数来优化模型？

1 个答案: