背景:我正在尝试使用Kaggle的泰坦尼克号灾难数据集来预测生存。
以下代码中使用的数据集train4
。
PassengerId Pclass Sex Age SibSp Parch Fare Cabin Sex_F Survived
1 3 male 22 1 0 7.25 0 0
2 1 female 38 1 0 71.2833 C85 1 1
3 3 female 26 0 0 7.925 1 1
4 1 female 35 1 0 53.1 C123 1 1
5 3 male 35 0 0 8.05 0 0
7 1 male 54 0 0 51.8625 E46 0 0
我试图在3个模型中分别运行n
次glm。我已将这3个模型包含在一个函数run_models
中。显然,函数已成功定义,但是当我在run_models(10)
中输入n时,它不会运行(抛出错误:In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading
)。
但如果我通过将n替换为10来直接运行for(i in 1:n)
部分,它就会运行。如果我可以自动化代码而不在每次出现时手动添加“n”,代码就会变得更加灵活。
这是完整的代码:
library("dplyr")
library("ggplot2")
library("scales")
train<-read.csv("train.csv")
test<-read.csv("test.csv")
attach(train)
#Filtering relevant informations
train2<-train[complete.cases(train),]
train3<-train2[,-c(4,9)]
train3<-train3[,c(1,3:9,2)]
#Dummy variables for Sex
train4<-mutate(train3,Sex_F = (train3$Sex == "female")*(train3$Sex == "female"))
#Massaging final dataset
train4<-train4[,c(1:8,10,9)]
#Fitting logistic regression
fit1<-glm(Survived~., data = train4, family = binomial(link='logit'))
fit2<-glm(Survived~ Pclass + Age + SibSp + Parch + Fare + Sex_F, data = train4, family = binomial(link='logit'))
fit3<-glm(Survived~ Pclass + Sex_F, data = train4, family = binomial(link='logit'))
models_summary<-data.frame(Model = numeric(),Accuracy = numeric(),Deviance = numeric())
run_models<-function(n){
for(i in 1:n)
{
#Making some dataset to check accuracy
#Model1
check1<-sample_n(train4,100)
check_fit1<-data.frame(round(predict(fit1, newdata = data.frame(check1[,-c(10)]),type="response"),0))
colnames(check_fit1)<-c("Survival_predicted")
final1<-cbind(check1[,c(1,10)],check_fit1[,1])
colnames(final1)<-c("Passenger ID","Survived","Survival_predicted")
accuracy1<-1 - nrow(final1[which(final1$Survived!=final1$Survival_predicted),])/nrow(final1)
#Model2
check2<-sample_n(train4[,-c(3,8)],100)
check_fit2<-data.frame(round(predict(fit2, newdata = data.frame(check2[,-c(8)]),type="response"),0))
colnames(check_fit2)<-c("Survival_predicted")
final2<-cbind(check2[,c(1,8)],check_fit2[,1])
colnames(final2)<-c("Passenger ID","Survived","Survival_predicted")
accuracy2<-1 - nrow(final2[which(final2$Survived!=final2$Survival_predicted),])/nrow(final2)
#Model3
check3<-sample_n(train4[,c(1,2,9,10)],100)
check_fit3<-data.frame(round(predict(fit3, newdata = data.frame(check3[,c(1:3)]),type="response"),0))
colnames(check_fit3)<-c("Survival_predicted")
final3<-cbind(check3[,c(1,4)],check_fit3[,1])
colnames(final3)<-c("Passenger ID","Survived","Survival_predicted")
accuracy3<-1 - nrow(final3[which(final3$Survived!=final3$Survival_predicted),])/nrow(final3)
#Summary
models_summary_TEMP<-data.frame(c("1","2","3"),c(accuracy1,accuracy2,accuracy3),c(fit1$deviance,fit2$deviance,fit3$deviance))
colnames(models_summary_TEMP)<-c("Model","Accuracy","Deviance")
models_summary<-rbind(models_summary,models_summary_TEMP)
models_summary_TEMP<-data.frame(Model = numeric(),Accuracy = numeric(),Deviance = numeric())
}
}
run_models(10)