我想使用单独的逻辑回归来模仿多项logit模型,并通过插入符号交叉验证它们。在非CV世界中,我希望实现以下目标:
# Create Data-Set
library(data.table)
library(dplyr)
N <- 1000
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N, 30, 8)
X3 <- rnorm(N,0,1)
length <- sample(0:5,N,T)
Ycont <- 0.5*X1 - 0.3*X2 +0.01*X3 + 10 + rnorm(N, 0, 6)
# create 3 categories
Ycateg <- ntile(Ycont,3)
df <- data.frame(id=1:N,length,X1, X2,X3, Ycateg)
df=setDT(df)[,.SD[rep(1L,length)],by = id]
df=df[ , time := 1:.N , by=id]
df=df[,-c("length")]
df$Ycateg=ifelse(df$Ycateg==1,"type1",ifelse(df$Ycateg==2,"type2","type0"))
head(df)
# aim of the model without CV - combine logit regressions
y_1=ifelse(df$Ycateg=="type1",1,0)
y_2=ifelse(df$Ycateg=="type2",1,0)
#drop the ID column
dat_model=df[,-1]
# fit the models
fit_1=glm(formula=y_1~. ,family=binomial(link='logit'),data=dat_model,control = list(maxit = 50))
fit_2=glm(formula=y_2~. ,family=binomial(link='logit'),data=dat_model,control = list(maxit = 50))
# predict
p_1=predict(fit_1,type = "response")
p_2=predict(fit_2,type = "response")
p_0=1-p_1-p_2
head(cbind(p_0,p_1,p_2))
p_0 p_1 p_2
1 1.000000e+00 2.220446e-16 2.220446e-16
2 0.000000e+00 2.220446e-16 1.000000e+00
3 4.930381e-32 1.000000e+00 2.220446e-16
4 4.930381e-32 1.000000e+00 2.220446e-16
所以我需要的是在插入符号框架中构建我自己的模型来复制上面的模型。到目前为止我所做的是:
#Extend Caret
customLogit <- list(type = "Classification", library = "stats", loop = NULL)
customLogit$parameters =data.frame(parameter = c("decay"), class = c("numeric"), label = c("decay"))
customLogit$grid = function(x, y, len = NULL, search = "grid") { }
customLogit$fit <- function(x,y, ...) {
y_1=ifelse(df$Ycateg=="type1",1,0)
y_2=ifelse(df$Ycateg=="type2",1,0)
fit_1=glm(formula=y_1~. ,family=binomial(link='logit'),control = list(maxit = 50),...)
fit_2=glm(formula=y_2~. ,family=binomial(link='logit'),control = list(maxit = 50),...)
out = vector("list",2)
out[[1]]=fit_1
out[[2]]=fit_2
return(out)
}
customLogit$predict <- function(modelFit_all, newdata, preProc = NULL, submodels = NULL,...) {
p_1=predict(modelFit_all[[1]],newdata=newdata,...)
p_2=predict(modelFit_all[[2]],newdata=newdata,...)
p_0=ifelse(p_1==0 & p_2==0,1,0)
out=cbind(p_0,p_1,p_2)
return(out)
}
customLogit$prob <- function(modelFit_all, newdata, preProc = NULL, submodels = NULL) {
p_1=predict(modelFit_all[[1]],newdata=newdata,type="response",...)
p_2=predict(modelFit_all[[2]],newdata=newdata,type="response",...)
p_0=1- p_1-p_0
out=cbind(p_cur,p_def,p_pre)
return(out)
}
customLogit$sort <- NULL
customLogit$levels <- function(x) x$classes
# which type of cross validation to do
fitControl <- trainControl(method = 'cv',number=5,classProbs=TRUE,summaryFunction=defaultSummary, selectionFunction = "best", savePredictions = TRUE)
# tuning parameters
grid <- expand.grid(decay = 0 )
cv=train(as.factor(Ycateg)~.,
data = dat_model,
method = customLogit,
trControl = fitControl,
tuneGrid = grid,
)
Sadely,我无法使代码工作,它给我带来了错误:
Error in train.default(x, y, weights = w, ...) :
argument is missing, with no default
我认为问题是decay
参数,但据我所知,人们无法使用glm“调整”逻辑回归模型,因此我不想引入任何“调整”参数。 / p>
非常感谢提前!