我正在尝试使用来自CDC的医疗保健数据在glm
中使用caret
制作r
模型。但是,每当我尝试使用train()
中的caret
命令训练模型时,我都会收到以下错误:
Error in `[.default`(y, , "time") : incorrect number of dimensions
以下是我的代码:
#download data
download.file(url = "ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/dataset_documentation/nhamcs/stata/ed2014-stata.zip",destfile = "ed2014-stata.zip")
unzip("ed2014-stata.zip")
library(haven)
nhamcs2014 <- read_dta("ed2014-stata.dta")
dim(nhamcs2014)
#isolate variables of interest
keep2014<- c("SEX","IMMEDR","SEEN72","CANCER","ETOHAB","ALZHD","ASTHMA","CEBVD","CKD","COPD","CHF","CAD","DEPRN",
"DIABTYP1","DIABTYP2","DIABTYP0","ESRD","HPE","EDHIV","HYPLIPID","HTN","OBESITY","OSA","OSTPRSIS",
"SUBSTAB")
new.nhamcs2014 <- nhamcs2014[keep2014]
#remove missing data
e=new.nhamcs2014$IMMEDR==-9
e.clean.nhamcs2014<- new.nhamcs2014[!e,]
f=e.clean.nhamcs2014$IMMEDR==-8
f.clean.nhamcs2014<- e.clean.nhamcs2014[!f,]
g=f.clean.nhamcs2014$SEEN72==-9
g.clean.nhamcs2014 <- f.clean.nhamcs2014[!g,]
h=g.clean.nhamcs2014$SEEN72==-8
h.clean.nhamcs2014 <- g.clean.nhamcs2014[!h,]
i <- h.clean.nhamcs2014$IMMEDR==7
i.clean.nhamcs2014 <- h.clean.nhamcs2014[!i,]
#Convert response variable (IMMEDR) to binomial variable
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==3] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==2] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==1] <- 0
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==5] <- 1
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR==4] <- 1
#clean data
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==1] <- 0
i.clean.nhamcs2014$SEX[i.clean.nhamcs2014$SEX==2] <- 1
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==1] <- 0
i.clean.nhamcs2014$SEEN72[i.clean.nhamcs2014$SEEN72==2] <- 1
View(i.clean.nhamcs2014)
sum(is.na(i.clean.nhamcs2014))
#create glm model using caret
library(caret)
set.seed(1)
inTrain<-createDataPartition(i.clean.nhamcs2014$IMMEDR, p=.75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain,]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain,]
control <- trainControl(method = "cv", number = 5, summaryFunction = twoClassSummary,
classProbs = TRUE, verboseIter = TRUE, returnResamp = "final")
model.glm <- train(IMMEDR~.,method = "glm", family = binomial(), metric = "ROC",
maximize = TRUE, data = train.nhamcs2014, trControl = control)
Error in `[.default`(y, , "time") : incorrect number of dimensions
非常感谢任何输入!
答案 0 :(得分:0)
问题出在输入标签中,它是一种笨拙的格式Labelled double
。当您在训练之前将其转换为因子时,它会毫无问题地运行:
在sum(is.na(i.clean.nhamcs2014))
之后运行:
i.clean.nhamcs2014$IMMEDR <- as.character(i.clean.nhamcs2014$IMMEDR)
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR == "0"] <- "zero"
i.clean.nhamcs2014$IMMEDR[i.clean.nhamcs2014$IMMEDR == "1"] <- "one"
i.clean.nhamcs2014$IMMEDR <- factor(i.clean.nhamcs2014$IMMEDR, levels = c("zero", "one"))
然后
set.seed(1)
inTrain<-createDataPartition(i.clean.nhamcs2014$IMMEDR, p=.75, list = FALSE)
train.nhamcs2014 <- i.clean.nhamcs2014[inTrain,]
test.nhamcs2014 <- i.clean.nhamcs2014[-inTrain,]
control <- trainControl(method = "cv", number = 5, summaryFunction = twoClassSummary,
classProbs = TRUE, verboseIter = TRUE, returnResamp = "final")
model.glm <- train(IMMEDR~.,method = "glm", family = binomial(), metric = "ROC",
maximize = TRUE, data = train.nhamcs2014, trControl = control)
> model.glm
Generalized Linear Model
12194 samples
24 predictor
2 classes: 'zero', 'one'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 9756, 9755, 9755, 9755, 9755
Resampling results:
ROC Sens Spec
0.632222 0.8814675 0.1774027