在 R 的 caret 包中使用 train() 函数时出错

时间:2018-05-04 21:22:03

标签: r r-caret training-data

我正在尝试使用来自 CDC 的医疗保健数据,在 R 中通过 caret 包建立一个 glm 模型。但是,每当我用 caret 的 train() 函数训练模型时,都会收到以下错误:

Error in `[.default`(y, , "time") : incorrect number of dimensions

以下是我的代码:

# Download data: fetch the zipped 2014 Stata file from the CDC FTP server,
# unpack it, and read the extracted .dta file into `nhamcs2014`.
download.file(
  url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/dataset_documentation/nhamcs/stata/ed2014-stata.zip",
  destfile = "ed2014-stata.zip"
)
unzip(zipfile = "ed2014-stata.zip")
library(haven)  # supplies read_dta()
nhamcs2014 <- read_dta(file = "ed2014-stata.dta")
# Print the table's dimensions as a quick check that the read succeeded.
dim(nhamcs2014)

# Isolate variables of interest: the response (IMMEDR) plus the columns used
# as predictors, selected by name from the full table.
keep2014 <- c(
  "SEX", "IMMEDR", "SEEN72",
  "CANCER", "ETOHAB", "ALZHD", "ASTHMA", "CEBVD",
  "CKD", "COPD", "CHF", "CAD", "DEPRN",
  "DIABTYP1", "DIABTYP2", "DIABTYP0",
  "ESRD", "HPE", "EDHIV", "HYPLIPID", "HTN",
  "OBESITY", "OSA", "OSTPRSIS", "SUBSTAB"
)
new.nhamcs2014 <- nhamcs2014[keep2014]

# Remove missing data: drop, in a single pass, every row whose IMMEDR is coded
# -9, -8 or 7, or whose SEEN72 is coded -9 or -8. This is the same set of rows
# that filtering on one code at a time would remove, without building a full
# intermediate copy of the table for each code.
drop.row <- new.nhamcs2014$IMMEDR == -9 |
  new.nhamcs2014$IMMEDR == -8 |
  new.nhamcs2014$IMMEDR == 7 |
  new.nhamcs2014$SEEN72 == -9 |
  new.nhamcs2014$SEEN72 == -8
# `%in% TRUE` maps an NA comparison result to FALSE. An NA in a logical row
# index would otherwise yield an NA-filled row instead of keeping the record.
i.clean.nhamcs2014 <- new.nhamcs2014[!(drop.row %in% TRUE), ]

# Convert response variable (IMMEDR) to a binomial variable:
# codes 1, 2 and 3 become 0; codes 4 and 5 become 1.
# The 0-assignment must run first: if 4/5 were mapped to 1 beforehand, the
# "== 1" test below would then turn those rows into 0 as well.
i.clean.nhamcs2014$IMMEDR[
  i.clean.nhamcs2014$IMMEDR == 3 |
    i.clean.nhamcs2014$IMMEDR == 2 |
    i.clean.nhamcs2014$IMMEDR == 1
] <- 0
i.clean.nhamcs2014$IMMEDR[
  i.clean.nhamcs2014$IMMEDR == 5 | i.clean.nhamcs2014$IMMEDR == 4
] <- 1

# Clean data: shift the 1/2 coding of SEX and SEEN72 to 0/1.
# Within each column the 1 -> 0 step has to precede the 2 -> 1 step, otherwise
# the freshly written 1s would be overwritten with 0.
i.clean.nhamcs2014[["SEX"]][i.clean.nhamcs2014[["SEX"]] == 1] <- 0
i.clean.nhamcs2014[["SEX"]][i.clean.nhamcs2014[["SEX"]] == 2] <- 1

i.clean.nhamcs2014[["SEEN72"]][i.clean.nhamcs2014[["SEEN72"]] == 1] <- 0
i.clean.nhamcs2014[["SEEN72"]][i.clean.nhamcs2014[["SEEN72"]] == 2] <- 1

# Open the cleaned table in the data viewer for a visual check.
View(i.clean.nhamcs2014)
# Count the NA cells remaining in the cleaned table.
sum(is.na(i.clean.nhamcs2014))

# Create glm model using caret.
# NOTE: this is the code as posted in the question; the train() call at the
# end is the one that raises the error quoted below.
library(caret)
set.seed(1)

# 75/25 split on the response; list = FALSE returns row indices usable
# directly for subsetting.
inTrain <- createDataPartition(
  i.clean.nhamcs2014$IMMEDR,
  p = .75,
  list = FALSE
)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain, ]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain, ]

# 5-fold cross-validation scored with twoClassSummary on class probabilities.
control <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  returnResamp = "final"
)

# Logistic regression of IMMEDR on all remaining columns, selected by ROC.
model.glm <- train(
  IMMEDR ~ .,
  data = train.nhamcs2014,
  method = "glm",
  family = binomial(),
  metric = "ROC",
  maximize = TRUE,
  trControl = control
)

Error in `[.default`(y, , "time") : incorrect number of dimensions

非常感谢任何建议!

1 个答案:

答案 0 :(得分:0)

问题出在响应变量(标签列)IMMEDR 上:它的类型是 labelled double(由 haven 读入 Stata 文件时产生),而不是因子。在训练之前先把它转换为因子,代码就能正常运行:

在 sum(is.na(i.clean.nhamcs2014)) 这一行之后运行:

# Turn the 0/1 response into a two-level factor with the level names
# "zero" and "one", in that order. Going through as.character() first
# discards the labelled class before the factor is built.
i.clean.nhamcs2014$IMMEDR <- factor(
  as.character(i.clean.nhamcs2014$IMMEDR),
  levels = c("0", "1"),
  labels = c("zero", "one")
)

然后

# Re-run the split and the training now that IMMEDR is a factor.
set.seed(1)

# 75/25 split on the response; list = FALSE returns row indices usable
# directly for subsetting.
inTrain <- createDataPartition(
  i.clean.nhamcs2014$IMMEDR,
  p = .75,
  list = FALSE
)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain, ]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain, ]

# 5-fold cross-validation scored with twoClassSummary on class probabilities.
control <- trainControl(
  method = "cv",
  number = 5,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  verboseIter = TRUE,
  returnResamp = "final"
)

# Logistic regression of IMMEDR on all remaining columns, selected by ROC.
model.glm <- train(
  IMMEDR ~ .,
  data = train.nhamcs2014,
  method = "glm",
  family = binomial(),
  metric = "ROC",
  maximize = TRUE,
  trControl = control
)

> model.glm
Generalized Linear Model 

12194 samples
   24 predictor
    2 classes: 'zero', 'one' 

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 9756, 9755, 9755, 9755, 9755 
Resampling results:

  ROC       Sens       Spec     
  0.632222  0.8814675  0.1774027