我正在尝试找到Kaggle TFI数据集的解决方案。
但是在使用随机森林在R中构建模型时,我发现了以下错误:
[.data.frame
(数据、、 all.vars(Terms),drop = FALSE)中的错误: 未定义的列已选择
代码:
library(Boruta)
library(caret)
train<-read.csv("train.csv")
names(train)
View(train)
str(train)
test<-read.csv("test.csv")
n.train<-nrow(train)
test$revenue<-1
myData<-rbind(train,test)
rm(train,test)
myData$Open.Date<-as.POSIXlt("01/01/2015", format="%m/%d/%Y") -
as.POSIXlt(myData$Open.Date, format="%m/%d/%Y")
myData$Open.Date<-as.numeric(myData$Open.Date/1000)
myData$City<-as.character(myData$City)
myData$City[myData$City.Group=="Other"]<-"Other"
myData$City[myData$City==unique(myData$City)[4]]<-unique(myData$City)[2]
myData$City<-as.factor(myData$City)
myData$City.Group<-NULL
myData$Type<-as.character(myData$Type)
myData$Type[myData$Type=="DT"]<-"IL"
myData$Type[myData$Type=="MB"]<-"FC"
myData$Type<-as.factor(myData$Type)
hist(train$revenue)
hist(log(train$revenue))
myData[,paste("P",1:37,sep="")]<-log(1+myData[,paste("P",1:37,sep="")])
myData$revenue<-log(myData$revenue)
head(myData,2)
important<-Boruta(revenue~.,data=myData[1:n.train,])
important$finalDecision
library(randomForest)
model<-train(myData$revenue[1:n.train]~.,method="rf",
data=myData[1:n.train,c(important$finalDecision!="Rejected",TRUE)])