我尝试使用rfe
函数执行递归功能消除,但我在尝试更改性能指标以输出ROC时遇到了一些麻烦:
newFunc <- caretFuncs
newFunc$summary <- twoClassSummary
ctrl <- rfeControl(functions = newFunc,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(predictors, response,
sizes = c(1),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'ROC',
rfeControl = ctrl)
尝试运行此代码会给我以下错误:
{:任务1失败 - &#34;选择了未定义的列&#34;
如果我删除自定义newFunc
,请在functions
内设置rfeControl
参数以使用caretFuncs
并从metric
中删除rfe
参数,该模型工作正常。这让我觉得摘要有问题。
caretFuncs $ summary:
function (data, lev = NULL, model = NULL)
{
if (is.character(data$obs))
data$obs <- factor(data$obs, levels = lev)
postResample(data[, "pred"], data[, "obs"])
}
twoClassSummary
function (data, lev = NULL, model = NULL)
{
lvls <- levels(data$obs)
if (length(lvls) > 2)
stop(paste("Your outcome has", length(lvls), "levels. The twoClassSummary() function isn't appropriate."))
requireNamespaceQuietStop("ModelMetrics")
if (!all(levels(data[, "pred"]) == lvls))
stop("levels of observed and predicted data do not match")
data$y = as.numeric(data$obs == lvls[2])
rocAUC <- ModelMetrics::auc(ifelse(data$obs == lev[2], 0,
1), data[, lvls[1]])
out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"],
lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out) <- c("ROC", "Sens", "Spec")
out
}
postResample
和twoClassSummary
的输出结构相同,所以我对这个问题有点遗失。我在这里做了一些天生错误的事情,还是这个我需要向开发者标记的错误?
我实际上对获取logLoss
感兴趣所以我可以编写自己的函数:
logLoss = function(data, lev = NULL, model = NULL) {
-1*mean(log(data[, 'pred'][model.matrix(~ as.numeric(data[, 'obs'], levels = lev) + 0) - data[, 'pred'] > 0]))
}
但是,我有点不确定如何将因子级别从我的[0,1]
因子转换为正确的[yes, no]
?
答案 0 :(得分:1)
首先,这是一个与插入符号一起使用的可行的logloss函数:
LogLoss <- function (data, lev = NULL, model = NULL)
{
obs <- data[, "obs"]
cls <- levels(obs) #find class names
probs <- data[, cls[2]] #use second class name
probs <- pmax(pmin(as.numeric(probs), 1 - 1e-15), 1e-15) #bound probability
logPreds <- log(probs)
log1Preds <- log(1 - probs)
real <- (as.numeric(data$obs) - 1)
out <- c(mean(real * logPreds + (1 - real) * log1Preds)) * -1
names(out) <- c("LogLoss")
out
}
回答如何将因子水平从我的[yes,no]因子转换为正确的[0,1]的问题:
real <- (as.numeric(data$obs) - 1)
要让rfe
工作,您可以使用rfFuncs
代替caretFuncs
。例如:
rfFuncs$summary <- twoClassSummary
ctrl <- rfeControl(functions = rfFuncs,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(Sonar[,1:60], Sonar$Class,
sizes = c(1, 5, 20, 40, 60),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'ROC',
rfeControl = ctrl)
profiler$results
Variables ROC Sens Spec ROCSD SensSD SpecSD
1 1 0.6460027 0.6387987 0.5155187 0.08735968 0.132008571 0.007516016
2 5 0.7563971 0.6847403 0.7013180 0.03751483 0.008724045 0.039383924
3 20 0.8633511 0.8462662 0.7017432 0.08460677 0.091143309 0.097708207
4 40 0.8841540 0.8642857 0.7429847 0.08096697 0.090913729 0.098309489
5 60 0.8945351 0.9004870 0.7431973 0.05707867 0.064971175 0.127471631
或我提供的LogLoss功能:
rfFuncs$summary <- LogLoss
ctrl <- rfeControl(functions = rfFuncs,
method = 'cv',
returnResamp = TRUE,
number = 2,
verbose = TRUE)
profiler <- rfe(Sonar[,1:60], Sonar$Class,
sizes = c(1, 5, 20, 40, 60),
method = 'nnet',
tuneGrid = expand.grid(size=c(4), decay=c(0.1)),
maxit = 20,
metric = 'LogLoss',
rfeControl = ctrl)
profiler$results
Variables LogLoss LogLossSD
1 1 1.8237372 1.030120134
2 5 0.5548774 0.128704686
3 20 0.4226522 0.021547998
4 40 0.4167819 0.013587892
5 60 0.4328718 0.008000892
答案 1 :(得分:1)
但是您应该使logLoss最小化,因此使用此代码(例如逻辑回归https://www.kaggle.com/demetrypascal/rfe-logreg-with-pca-and-feature-importance的示例):
LogLoss <- function (data, lev = NULL, model = NULL)
{
obs <- data[, "obs"]
cls <- levels(obs) #find class names
probs <- data[, cls[2]] #use second class name
probs <- pmax(pmin(as.numeric(probs), 1 - 1e-15), 1e-15) #bound probability
logPreds <- log(probs)
log1Preds <- log(1 - probs)
real <- (as.numeric(data$obs) - 1)
out <- c(mean(real * logPreds + (1 - real) * log1Preds)) * -1
names(out) <- c("LogLossNegative")
-out
}
lrFuncs$summary <- LogLoss
rfec = rfeControl(method = "cv",
number = 2,
functions = lrFuncs)