我使用 glmnetUtils 在 11 个 alpha 值上重复执行交叉验证(CV)。我的编码能力有限,所以我分别手动运行了 N 次重复,每次都固定折(fold)的分配:
# Grid of elastic-net mixing parameters: 0.5, 0.55, ..., 1
alpha <- seq(from = 0.5, to = 1, by = 0.05)

# Fold labels for repeat 1: every observation is assigned a label in 1..10,
# drawn with replacement under a fixed seed so the repeat is reproducible
set.seed(1)
folds1 <- sample(seq_len(10), size = length(df$id[1:2300]), replace = TRUE)

# Fold labels for repeat 2: same scheme, different seed
set.seed(2)
folds2 <- sample(seq_len(10), size = length(df$id[1:2300]), replace = TRUE)

# Cross-validate over the whole alpha grid, once per fold assignment
enet.1 <- cva.glmnet(
  x, y,
  alpha = alpha,
  foldid = folds1,
  family = "binomial"
)
enet.2 <- cva.glmnet(
  x, y,
  alpha = alpha,
  foldid = folds2,
  family = "binomial"
)
# Determine optimal combination of alpha and lambda; extract lowest CV error and associated lambda at each alpha
#' Extract the selected lambdas and their CV errors from a cross-validated fit
#'
#' @param object A list-like fit exposing `lambda` (the lambda path), `cvm`
#'   (the cross-validated error at each lambda on that path), `lambda.min`
#'   and `lambda.1se`. In this script it is applied to each element of a
#'   `cva.glmnet` `modlist`.
#' @return A one-row data frame with columns `lambda.min`, `cv.min`,
#'   `lambda.1se` and `cv.1se`.
extractGlmnetInfo <- function(object) {
  # Locate a selected lambda on the path by nearest value instead of `==`.
  # An exact floating-point comparison can match nothing, which leaves a
  # zero-length index and makes data.frame() fail; the nearest-value lookup
  # always yields exactly one position (the exact match when there is one).
  path_index <- function(target) {
    which.min(abs(object$lambda - target))
  }

  lambda_min <- object$lambda.min
  lambda_1se <- object$lambda.1se

  # One row: each selected lambda paired with the CV error at its position
  data.frame(
    lambda.min = lambda_min,
    cv.min = object$cvm[path_index(lambda_min)],
    lambda.1se = lambda_1se,
    cv.1se = object$cvm[path_index(lambda_1se)]
  )
}
# Summarise each repeat of 10-fold CV: for every model in the repeat's
# modlist, pull out the selected lambdas (lambda.min, lambda.1se) and the CV
# error at each of them
enet.iter.1 <- ldply(enet.1$modlist, extractGlmnetInfo)
enet.iter.2 <- ldply(enet.2$modlist, extractGlmnetInfo)

# Label each repeat with its alpha values explicitly. Binding `alpha` onto the
# already-stacked rows would depend on silent recycling, which mislabels rows
# whenever a repeat does not contribute exactly one row per alpha.
stopifnot(
  nrow(enet.iter.1) == length(alpha),
  nrow(enet.iter.2) == length(alpha)
)
enet.iter <- bind_rows(
  data.frame(alpha = alpha, enet.iter.1),
  data.frame(alpha = alpha, enet.iter.2)
)

# Means (across repeats) of the selected lambdas and their CV errors, per alpha
lambda.min <- aggregate(lambda.min ~ alpha, data = enet.iter, FUN = mean)
cv.min <- aggregate(cv.min ~ alpha, data = enet.iter, FUN = mean)
lambda.1se <- aggregate(lambda.1se ~ alpha, data = enet.iter, FUN = mean)
cv.1se <- aggregate(cv.1se ~ alpha, data = enet.iter, FUN = mean)

# Create data frame with means of CV error and lambda at each value of alpha.
# Columns are assembled by name so nothing depends on auto-generated column
# names or on column positions.
enet.means <- data.frame(
  alpha = lambda.min$alpha,
  lambda.min = lambda.min$lambda.min,
  cv.min = cv.min$cv.min,
  lambda.1se = lambda.1se$lambda.1se,
  cv.1se = cv.1se$cv.1se
)
# Extract optimal values of alpha and lambda.
# which.min() yields a single row index per criterion (the first one on ties),
# so the error, lambda and alpha reported for a criterion always come from the
# same row of enet.means and are always length one.
enet.idx.min <- which.min(enet.means$cv.min)
enet.error.min <- enet.means$cv.min[enet.idx.min]
enet.lambda.min <- enet.means$lambda.min[enet.idx.min]
enet.alpha.min <- enet.means$alpha[enet.idx.min]

enet.idx.1se <- which.min(enet.means$cv.1se)
enet.error.1se <- enet.means$cv.1se[enet.idx.1se]
enet.lambda.1se <- enet.means$lambda.1se[enet.idx.1se]
enet.alpha.1se <- enet.means$alpha[enet.idx.1se]

# Collect both selections in one named list
enet.optimal <- list(
  error.min = enet.error.min,
  lambda.min = enet.lambda.min,
  alpha.min = enet.alpha.min,
  error.1se = enet.error.1se,
  lambda.1se = enet.lambda.1se,
  alpha.1se = enet.alpha.1se
)
从这里,可以从每个模型中提取预测的概率并将其平均以用于其他模型;但是,当所选变量可能因模型而异时,我不清楚如何提取系数并求平均值。
问题
是否可以使用 glmnet(Utils) 提取各个模型的系数并对其求平均?还是最好使用 alpha 和 lambda 的最佳组合(以及一组单独的折分配)来拟合一个新模型,并使用该模型生成的系数?