使用glmnet(Utils)重复CV后的平均系数

时间:2018-08-10 02:09:24

标签: r glmnet

我使用 glmnetUtils 在 11 个 alpha 值上重复执行交叉验证(CV)。我的编码能力有限,所以我把 N 次重复逐一单独运行,每次重复都预先固定各自的折(fold)划分:

# Elastic-net mixing values to cross-validate over: 0.5 to 1 in steps of 0.05
alpha <- seq(.5, 1, .05)

# Fixed fold assignments (fold ids 1..10), one vector per CV repetition.
# Each repetition is seeded separately so that it is reproducible but uses a
# different split. Ids are drawn with replacement, so fold sizes are only
# approximately equal.
#
# The length is taken from x itself because foldid needs one entry per
# observation handed to cva.glmnet. The former `length(df$id[1:2300])` is 2300
# for any id column, however many rows the data really has (indices past the
# end are padded with NA), so it could silently disagree with x.
# NOTE(review): assumes x holds one observation per row -- confirm.
set.seed(1)
folds1 <- sample(1:10, size = NROW(x), replace = TRUE)

set.seed(2)
folds2 <- sample(1:10, size = NROW(x), replace = TRUE)

# Run glmnet: one cross-validated binomial elastic-net fit per repetition,
# each searching over every value in `alpha` with that repetition's folds

enet.1 <- cva.glmnet(x, y,
                     foldid = folds1,
                     alpha = alpha,
                     family = "binomial")

enet.2 <- cva.glmnet(x, y,
                     foldid = folds2,
                     alpha = alpha,
                     family = "binomial")


# Determine optimal combination of alpha and lambda; extract lowest CV error and associated lambda at each alpha

extractGlmnetInfo <- function(object) {
  # Summarise one cross-validated fit: the two selected penalties
  # (object$lambda.min and object$lambda.1se) together with the CV error
  # (object$cvm) recorded at the position each occupies in object$lambda.
  # Returns a data frame with columns lambda.min, cv.min, lambda.1se, cv.1se.

  # Position(s) of a given penalty value within the fitted lambda sequence
  path_index <- function(penalty) which(object$lambda == penalty)

  penalty_min <- object$lambda.min
  penalty_1se <- object$lambda.1se

  data.frame(
    lambda.min = penalty_min,
    cv.min = object$cvm[path_index(penalty_min)],
    lambda.1se = penalty_1se,
    cv.1se = object$cvm[path_index(penalty_1se)]
  )
}

# Extract smallest CV error and lambda at each alpha for each iteration of 10-fold CV
# Calculate means (across iterations) of lowest CV error and associated lambdas for each alpha
# NOTE(review): assumes modlist holds one fit per alpha, in the order of
# `alpha` -- confirm against the glmnetUtils documentation.

enet.iter.1 <- ldply(enet.1$modlist, extractGlmnetInfo)
enet.iter.2 <- ldply(enet.2$modlist, extractGlmnetInfo)

# Every iteration must contribute exactly one row per alpha; otherwise the
# alpha labels attached below would be misaligned with the results.
stopifnot(nrow(enet.iter.1) == length(alpha),
          nrow(enet.iter.2) == length(alpha))

enet.iter <- bind_rows(enet.iter.1, enet.iter.2)

# Label each row with its alpha. The repetition is spelled out (one copy of
# `alpha` per iteration) instead of relying on silent vector recycling.
enet.iter <- data.frame(alpha = rep(alpha, times = 2), enet.iter)

lambda.min <- aggregate(lambda.min ~ alpha, enet.iter, mean)
cv.min <- aggregate(cv.min ~ alpha, enet.iter, mean)
lambda.1se <- aggregate(lambda.1se ~ alpha, enet.iter, mean)
cv.1se <- aggregate(cv.1se ~ alpha, enet.iter, mean)

# Create data frame with means of CV error and lambda at each value of alpha
# Columns are named explicitly, so nothing depends on column positions.

enet.means <- data.frame(alpha = lambda.min$alpha,
                         lambda.min = lambda.min$lambda.min,
                         cv.min = cv.min$cv.min,
                         lambda.1se = lambda.1se$lambda.1se,
                         cv.1se = cv.1se$cv.1se)

# Extract optimal values of alpha and lambda
# which.min() always yields a single row (the first one on ties), so every
# element of enet.optimal is a scalar, and no floating-point `==` is needed.

idx.min <- which.min(enet.means$cv.min)
enet.error.min <- enet.means$cv.min[idx.min]
enet.lambda.min <- enet.means$lambda.min[idx.min]
enet.alpha.min <- enet.means$alpha[idx.min]

idx.1se <- which.min(enet.means$cv.1se)
enet.error.1se <- enet.means$cv.1se[idx.1se]
enet.lambda.1se <- enet.means$lambda.1se[idx.1se]
enet.alpha.1se <- enet.means$alpha[idx.1se]

enet.optimal <- list(error.min = enet.error.min,
                     lambda.min = enet.lambda.min,
                     alpha.min = enet.alpha.min,
                     error.1se = enet.error.1se,
                     lambda.1se = enet.lambda.1se,
                     alpha.1se = enet.alpha.1se)

从这里,可以从每个模型中提取预测的概率并将其平均以用于其他模型;但是,当所选变量可能因模型而异时,我不清楚如何提取系数并求平均值。

问题

是否可以直接用 glmnet(Utils) 提取各次重复的系数并求平均?还是更好的做法是,用 alpha 和 lambda 的最佳组合(并配合一组独立的折划分)重新拟合一个新模型,然后使用该模型给出的系数?

0 个答案:

没有答案