我正在对一个二分类因变量(DV)做逻辑回归,模型有两个预测变量:性别(二分类)和政治倾向(连续)。我需要帮助,让我的GLM能够在交叉验证中运行!尽管我多次转换了变量的类型,代码仍然无法运行,我不确定问题出在哪里。
这是我的代码:
`
#######################################################
# Cross-Validation of the Logistic Regression
#######################################################
# Numeric copies of the two predictors and of the outcome. These are created
# as free-standing vectors in the workspace; they are NOT added as columns of
# choicelife.data.
gen <- as.numeric(choicelife.data$gender)
lnc <- as.numeric(choicelife.data$lc)
procprol <-as.numeric(choicelife.data$views)
# Cross-validation setup: number of folds, plus one MSE slot per fold for each
# of the two models being compared.
nCV <- 50
MSE_1 <- numeric(nCV)
MSE_2 <- numeric(nCV)
# Label the positions of a random permutation with fold numbers 1..nCV.
# NOTE(review): `n` is not defined anywhere in this snippet -- presumably it is
# meant to be nrow(choicelife.data); confirm.
folds <- cut(sample(n),breaks=nCV,labels=FALSE)
# Perform nCV-fold cross-validation
i <- 1
for(i in 1:nCV){
# Segment the data by fold using the which() function: rows labelled i form
# the test set, all remaining rows form the training set.
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- choicelife.data[testIndexes, ]
trainData <- choicelife.data[-testIndexes, ]
# Models: gender only (mod1) versus gender plus lc (mod2).
# NOTE(review): `gen` and `lnc` are not columns of trainData, so they can only
# be resolved from the workspace vectors created above, whose length is that of
# the full data set rather than of the training subset. The response used here
# is the raw `views` column, not the numeric `procprol`.
mod1<- glm(views ~ gen,
family=binomial(link=logit), data=trainData)
mod2<- glm(views ~ gen + lnc,
family=binomial(link=logit), data=trainData)
# Get predictions for the held-out fold.
# NOTE(review): no `type` argument is given, so predict() on a glm returns
# values on the link (log-odds) scale, not probabilities.
pred_1 <- predict(mod1, newdata = testData)
pred_2 <- predict(mod2, newdata = testData)
# Calculate MSE between the observed `views` and the predictions.
MSE_1[i] <- mean((testData$views - pred_1)^2)
MSE_2[i] <- mean((testData$views - pred_2)^2)
}
# Show any warnings collected while the loop ran.
warnings()
# mean MSEs across folds
mean(MSE_1)
mean(MSE_2)
# get per-fold differences between the two models
diffs <- MSE_1 - MSE_2
# get 95% CIs: mean difference plus/minus two standard deviations of the
# per-fold differences
meandiff <- mean(diffs)
sddiff <- sd(diffs)
c(meandiff-2*sddiff, meandiff+2*sddiff) # 95% Confidence interval (n, n)
答案 0 :(得分:0)
您把一些变量转换成了数值型,但没有把它们放回 data.frame 中。在对 nCV 个折进行迭代时,取出的子集数据框并不包含这些数值变量,因此代码无法运行。
首先,我模拟一个与您的 choicelife.data 类似的数据框:
# Simulated stand-in for the asker's data: 100 rows with an integer score `lc`
# (1-10), a two-level `gender`, and a two-level outcome `views`.
# `gender` and `views` are built as explicit factors: since R 4.0 data.frame()
# no longer converts strings to factors, and as.numeric() on a character column
# returns NA instead of the level codes (1, 2) that the recoding further down
# relies on. Level order is fixed so that "Against" -> 1 and "Pro" -> 2.
choicelife.data <- data.frame(
  lc = sample(1:10, 100, replace = TRUE),
  gender = factor(
    sample(c("M", "F"), 100, replace = TRUE),
    levels = c("F", "M")
  ),
  views = factor(
    sample(c("Pro", "Against"), 100, replace = TRUE),
    levels = c("Against", "Pro")
  )
)
建议的修改如下:
# Store the numeric codings as columns of the data frame, so that every
# train/test subset taken from choicelife.data carries them along.
# as.factor() is applied first: on a character column (the data.frame() default
# since R 4.0) a bare as.numeric() yields NA, whereas factor level codes are
# 1, 2, ... for both factor and character input.
choicelife.data$gen <- as.numeric(as.factor(choicelife.data$gender))
choicelife.data$lnc <- as.numeric(choicelife.data$lc)
# Outcome coded 0/1 (first level -> 0, second level -> 1), as required for the
# response of a binomial glm.
choicelife.data$procprol <- as.numeric(as.factor(choicelife.data$views)) - 1
# k-fold cross-validation comparing two logistic regressions by the mean
# squared error of their out-of-fold predicted probabilities.
nCV <- 5
MSE_1 <- numeric(nCV)
MSE_2 <- numeric(nCV)
# Randomly assign every row to one of nCV folds by cutting a random
# permutation of the row numbers into nCV equal-width intervals.
# seq_len() is used instead of 1:nrow(), which would yield c(1, 0) for an
# empty data frame.
folds <- cut(
  sample(seq_len(nrow(choicelife.data))),
  breaks = nCV,
  labels = FALSE
)
for (i in seq_len(nCV)) {
  # A logical mask selects the training rows: with negative indices an empty
  # fold would give choicelife.data[-integer(0), ], i.e. zero training rows.
  is_test <- folds == i
  testIndexes <- which(is_test)
  testData <- choicelife.data[testIndexes, ]
  trainData <- choicelife.data[!is_test, ]
  # Models: gender only (mod1) versus gender plus lc (mod2)
  mod1 <- glm(procprol ~ gen,
    family = binomial(link = logit), data = trainData
  )
  mod2 <- glm(procprol ~ gen + lnc,
    family = binomial(link = logit), data = trainData
  )
  # Get predictions on the probability scale (type = "response") so they are
  # comparable with the 0/1 outcome.
  pred_1 <- predict(mod1, newdata = testData, type = "response")
  pred_2 <- predict(mod2, newdata = testData, type = "response")
  # Calculate MSE on the held-out fold
  MSE_1[i] <- mean((testData$procprol - pred_1)^2)
  MSE_2[i] <- mean((testData$procprol - pred_2)^2)
}