我一直在尝试使用 `cv.glmnet` 来拟合 lasso(套索)模型。我按照不同的标准化方式实现了四个模型(3 个使用 `cv.glmnet`,1 个使用 `caret::train`)。这四个模型给出的系数估计差别很大,我弄不清楚原因。
以下是一段完全可复现的代码:
# Reproducible comparison of four lasso fits on the two-class subset of iris:
# three cv.glmnet() fits that differ only in how the predictors are scaled,
# and one caret::train() fit tuned over the same lambda grid.
library("glmnet")

data(iris)
dat <- iris[iris$Species %in% c("setosa", "versicolor"), ]
X <- as.matrix(dat[, 1:4])
# Round-trip through character so the factor keeps only the two levels
# that are actually present in `dat`.
Y <- as.factor(as.character(dat$Species))

# Decreasing grid of 100 lambda values on [0, 1], shared by every fit below.
lambda.grid <- rev(seq(0, 1, length.out = 100))
alpha.grid <- 1 # alpha = 1 is the pure lasso penalty

# The seed is reset before every fit so each cross-validation starts from
# the same RNG state.

# model1: raw predictors, no standardization inside glmnet.
set.seed(123)
model1 <- cv.glmnet(
  x = X,
  y = Y,
  family = "binomial",
  standardize = FALSE,
  alpha = 1,
  lambda = lambda.grid,
  nfolds = 3
)

# model2: predictors centered and scaled by hand, glmnet told not to rescale.
set.seed(123)
model2 <- cv.glmnet(
  x = scale(X, center = TRUE, scale = TRUE),
  y = Y,
  family = "binomial",
  standardize = FALSE,
  alpha = 1,
  lambda = lambda.grid,
  nfolds = 3
)

# model3: raw predictors, glmnet standardizes internally.
set.seed(123)
model3 <- cv.glmnet(
  x = X,
  y = Y,
  family = "binomial",
  standardize = TRUE,
  alpha = 1,
  lambda = lambda.grid,
  nfolds = 3
)

## Using caret
library("caret")

# NOTE: this variable has the same name as caret's trainControl() function;
# the call on the right-hand side still resolves to the function.
trainControl <- trainControl(method = "cv", number = 3) # 3-fold CV

# Tuning parameters passed to train() below.
tuneGrid <- expand.grid(.alpha = alpha.grid, .lambda = lambda.grid)

# model4: same raw predictors and lambda grid, tuned through caret.
set.seed(123)
model4 <- train(
  x = X,
  y = Y,
  method = "glmnet",
  family = "binomial",
  standardize = FALSE,
  trControl = trainControl,
  tuneGrid = tuneGrid
)

# Coefficients at each model's selected lambda, converted from sparse to
# dense matrices for easy side-by-side inspection.
c1 <- as.matrix(coef(model1, s = model1$lambda.min))
c2 <- as.matrix(coef(model2, s = model2$lambda.min))
c3 <- as.matrix(coef(model3, s = model3$lambda.min))
c4 <- as.matrix(coef(model4$finalModel, s = model4$finalModel$lambdaOpt))
`model2` 预先对自变量(矩阵 `X`)做了缩放,而 `model3` 通过设置 `standardize = TRUE` 让 glmnet 在内部进行缩放。所以至少这两个模型应该返回相同的结果——但事实并非如此。
从四个模型中获得的lambda.min是:
model1 = 0
model2 = 0
model3 = 0
model4 = 0.6565657
模型之间的系数估计也有很大差异。为什么会这样?
答案 0 :(得分:0)
实际上,`scale(x)` 配合 `standardize = FALSE`,与直接使用 `x` 配合 `standardize = TRUE`,两者之间略有不同:`scale()` 使用 1/(N-1) 的方差公式,而 glmnet 内部标准化使用 1/N 的公式,因此两者相差一个与 (N-1)/N 有关的缩放因子,需要乘上该因子加以校正。
请参阅here。
如果我们使用高斯族,
# Gaussian family: the standardized coefficient derived from a
# standardize = TRUE fit is reproduced by a standardize = FALSE fit on
# manually standardized x and y.
library(glmnet)

# 50 x 2 matrix of uniform draws. No seed is set, so the printed values
# change from run to run.
X <- matrix(runif(100, 0, 1), ncol = 2)
y <- 1 - 2 * X[, 1] + X[, 2]

enet <- glmnet(X, y, lambda = 0.1, standardize = TRUE, family = "gaussian")
coefficients(enet)
coef <- coefficients(enet)
coef[2] * sd(X[, 1]) / sd(y) # standardized coef
# [1] -0.6895065  (example output from one random draw)

# glmnet standardizes with the 1/n variance formula, while sd() and scale()
# use 1/(n - 1). sd_ratio converts an sd()-based scale to glmnet's scale.
# It is computed from the data instead of being hard-coded, so the example
# stays correct if the number of rows changes.
n <- nrow(X)
sd_ratio <- sqrt((n - 1) / n)
sd_y <- sd(y) * sd_ratio # 1/n standard deviation of y

# Rescaling y by sd_y requires rescaling lambda by the same amount to keep
# the penalized problem equivalent.
enet1 <- glmnet(
  scale(X) / sd_ratio,
  y / sd_y,
  lambda = 0.1 / sd_y,
  standardize = FALSE,
  family = "gaussian"
)
coefficients(enet1)[2]
# Expected to agree with the standardized coefficient printed above, up to
# the solver's convergence tolerance.
如果我们使用二项分布族,
# Binomial family: a standardize = TRUE fit on raw predictors versus a
# standardize = FALSE fit on predictors standardized by hand with glmnet's
# own (1/n) convention.
data(iris)
dat <- iris[iris$Species %in% c("setosa", "versicolor"), ]
X <- as.matrix(dat[, 1:4])
# Round-trip through character so only the two present levels remain.
Y <- as.factor(as.character(dat$Species))

# Decreasing grid of 100 lambda values on [0, 1], used by both fits.
lambda_grid <- rev(seq(0, 1, length.out = 100))

# glmnet standardizes with the 1/n variance formula, while sd() and scale()
# use 1/(n - 1). sd_ratio converts an sd()-based scale to glmnet's scale.
n <- nrow(X)
sd_ratio <- sqrt((n - 1) / n)

set.seed(123)
model1 <- cv.glmnet(
  x = X,
  y = Y,
  family = "binomial",
  standardize = TRUE,
  alpha = 1,
  lambda = lambda_grid,
  nfolds = 3
)
# Row 1 is the intercept, so [3] is the coefficient of the second predictor.
# Multiplying by that predictor's 1/n standard deviation puts the
# original-scale coefficient back on the standardized scale.
coefficients(model1, s = 0.03)[3] * sd(X[, 2]) * sd_ratio

set.seed(123)
model3 <- cv.glmnet(
  x = scale(X) / sd_ratio, # same columns glmnet builds internally above
  y = Y,
  family = "binomial",
  standardize = FALSE,
  alpha = 1,
  lambda = lambda_grid,
  nfolds = 3
)
coefficients(model3, s = 0.03)[3]
# The two printed values are expected to agree up to the solver's
# convergence tolerance.
这些结果几乎相同。希望这个答案还为时不晚。