R caret 包的 predict() 函数报错:“newdata”行数不一致,且“type”参数不被接受

时间:2016-05-27 07:10:29

标签: r machine-learning logistic-regression r-caret training-data

  • 我正在使用 caret 包进行逻辑回归分析。

  • 数据输入为18x6矩阵

  • 除了 predict() 函数之外,一切都运行正常。

  • R 告诉我 type 参数应该是 raw 或 prob,但 raw 只是输出最后一列(二项响应变量的取值)的精确副本,而 prob 则给出以下错误:

  

“Error in dimnames(out)[[2]] <- modelFit$obsLevels : 'dimnames'[2] 的长度不等于数组范围。另外还有警告信息:'newdata' 有 7 行,但找到的变量有 18 行”

# Reproduction script from the question: builds an 18 x 6 data set, splits it
# with caret, fits a logistic regression through train(), and then calls
# predict() on the training and test subsets.
install.packages("pbkrtest")
install.packages("caret")
install.packages('e1071', dependencies=TRUE)
#install.packages('caret', dependencies = TRUE)
# require() and library() both attach caret; the second call is redundant.
require(caret)
library(caret)

# 18 x 6 numeric matrix. No column names are assigned anywhere in this script.
# The last 18 values (column 6) are the 0/1 response.
A=matrix(
  c(
    64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946,66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627,68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755,69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500,73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500,1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1
  ),

  nrow = 18,
  ncol = 6,
  byrow = FALSE)  # fill column-wise (matrix() has no "bycol" argument)
# The same six columns again, as free-standing 18-element vectors a..f.
a<-c(64830,18213,4677,24761,9845,17504,22137,12531,5842,28827,51840,4079,1000,2069,969,9173,11646,946)
b<-c(66161,18852,5581,27219,10159,17527,23402,11409,8115,31425,55993,0,0,1890,1430,7873,12779,627)
c<-c(68426,18274,5513,25687,10971,14104,19604,13438,6011,30055,57242,0,0,2190,1509,8434,10492,755)
d<-c(69716,18366,5735,26556,11733,16605,20644,15516,5750,31116,64330,0,0,1850,1679,9233,12000,500)
e<-c(73128,18906,5759,28555,11951,19810,22086,17425,6152,28469,72020,0,0,1400,1750,8599,12000,500)
f<-c(1,1,1,0,1,0,0,0,0,1,0,1,1,1,1,1,1,1)
# End of the vector definitions.

# n and K are not referenced again in the visible part of this script.
n<-nrow(A);
K<-ncol(A)-1;

# Partition on the numeric 0/1 vector f (it is not converted to a factor).
Train <- createDataPartition(f, p=0.6, list=FALSE)  # p = 0.6: about 60% of the rows go to training
# Row subsets of the matrix A; both remain matrices without column names.
training <- A[ Train, ]
testing <- A[ -Train, ]
nrow(training)

# Logistic regression: the coefficients describe the relationship between each
# predictor and the response on the log-odds scale.
# NOTE(review): `training` has no columns named a..f, so the formula terms
# presumably resolve to the 18-element vectors defined above instead of the
# rows of `training` -- that would explain the "'newdata' had 7 rows but
# variables found have 18 rows" warning. Confirm.
# NOTE(review): f is numeric here; caret presumably treats this as regression,
# so no class levels are stored and type = "prob" fails while setting
# dimnames. Confirm against the caret documentation.
mod_fit <- train(f ~ a + b + c + d +e,  data=training, method="glm", family="binomial")
mod_fit

# Exponentiate the fitted coefficients to obtain odds ratios per predictor.
exp(coef(mod_fit$finalModel))

# These are the two calls the question is about; the second one is reported to
# raise the dimnames error quoted in the question.
predict(mod_fit, newdata=training)
predict(mod_fit, newdata=testing, type="prob")

3 个答案:

答案 0 :(得分:0)

我不太确定,但 A 本身就是由 (a, b, c, d, e, f) 组成的矩阵,所以没有必要同时创建矩阵和向量这两套对象。

# Install the required packages only when they are missing, instead of
# reinstalling them on every run of the script.
if (!requireNamespace("pbkrtest", quietly = TRUE)) {
  install.packages("pbkrtest")
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071", dependencies = TRUE)
}
# library() fails loudly if caret is unavailable; a second require() call is
# not needed.
library(caret)

# Build the data directly as a data frame with named columns a..f, so that the
# model formula can find its variables inside `data`. The values are the same
# 18 x 6 data as before, one vector per column.
A <- data.frame(
  a = c(64830, 18213, 4677, 24761, 9845, 17504, 22137, 12531, 5842,
        28827, 51840, 4079, 1000, 2069, 969, 9173, 11646, 946),
  b = c(66161, 18852, 5581, 27219, 10159, 17527, 23402, 11409, 8115,
        31425, 55993, 0, 0, 1890, 1430, 7873, 12779, 627),
  c = c(68426, 18274, 5513, 25687, 10971, 14104, 19604, 13438, 6011,
        30055, 57242, 0, 0, 2190, 1509, 8434, 10492, 755),
  d = c(69716, 18366, 5735, 26556, 11733, 16605, 20644, 15516, 5750,
        31116, 64330, 0, 0, 1850, 1679, 9233, 12000, 500),
  e = c(73128, 18906, 5759, 28555, 11951, 19810, 22086, 17425, 6152,
        28469, 72020, 0, 0, 1400, 1750, 8599, 12000, 500),
  f = c(1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1)
)

# The response must be a factor for train() to fit a classifier.
A$f <- as.factor(A$f)

# Fix the RNG state so the random partition is reproducible.
set.seed(123)

# Stratified split on the response: p = 0.6 sends about 60% of the rows to
# the training set.
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)
training <- A[Train, ]
testing <- A[-Train, ]
nrow(training)

进行预测时,newdata 中只需要提供解释变量(自变量),而不应包含被预测的响应变量。

# Fit a logistic regression of f on the five predictors through caret.
# The formula variables are looked up as columns of `training`.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training,
  method = "glm",
  family = "binomial"
)
mod_fit

# Exponentiate the fitted log-odds coefficients to get odds ratios.
exp(coef(mod_fit$finalModel))

# Names of the explanatory columns. setdiff() is used instead of
# `-which(colnames(x) == "f")` because the latter selects zero columns when
# no column matches.
predictors <- setdiff(colnames(training), "f")

# Predicted classes for the training and the test rows.
predict(mod_fit, newdata = training[, predictors, drop = FALSE])
predict(mod_fit, newdata = testing[, predictors, drop = FALSE])

# Class probabilities; this relies on f being a factor in `training`.
predict(mod_fit, newdata = testing[, predictors, drop = FALSE], type = "prob")

答案 1 :(得分:0)

简短回答:传给 predict 的 newdata 中不应包含被解释变量(响应变量)f。所以你应该这样做:

# Drop the last column (the response f) so that newdata holds only the
# explanatory variables. The closing parenthesis of ncol() must come before
# the closing bracket of the subset.
predict(mod_fit, newdata = training[, -ncol(training)])
predict(mod_fit, newdata = testing[, -ncol(testing)])

警告消息'newdata' had 11 rows but variables found have 18 rows的问题是因为您使用整个数据集(18个观察值)运行回归,但仅使用其中的一部分(11或7)进行预测。

编辑:为了简化数据创建和 glm 拟合流程,我们可以这样做:

library(caret)

# All six variables live in one data frame, so the model formula and predict()
# both look them up in the same place.
A <- data.frame(
  a = c(64830, 18213, 4677, 24761, 9845, 17504, 22137, 12531, 5842,
        28827, 51840, 4079, 1000, 2069, 969, 9173, 11646, 946),
  b = c(66161, 18852, 5581, 27219, 10159, 17527, 23402, 11409, 8115,
        31425, 55993, 0, 0, 1890, 1430, 7873, 12779, 627),
  c = c(68426, 18274, 5513, 25687, 10971, 14104, 19604, 13438, 6011,
        30055, 57242, 0, 0, 2190, 1509, 8434, 10492, 755),
  d = c(69716, 18366, 5735, 26556, 11733, 16605, 20644, 15516, 5750,
        31116, 64330, 0, 0, 1850, 1679, 9233, 12000, 500),
  e = c(73128, 18906, 5759, 28555, 11951, 19810, 22086, 17425, 6152,
        28469, 72020, 0, 0, 1400, 1750, 8599, 12000, 500),
  f = c(1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1)
)

# A factor response makes train() fit a classifier, which is what
# predict(..., type = "prob") needs in order to return class probabilities.
A$f <- factor(A$f)

# Partition on the column A$f: this snippet defines no free-standing `f`.
# p = 0.6 sends about 60% of the rows to the training set.
Train <- createDataPartition(A$f, p = 0.6, list = FALSE)
training <- A[Train, ]
testing <- A[-Train, ]

# Logistic regression of f on the five predictors, fitted on the training rows.
mod_fit <- train(
  f ~ a + b + c + d + e,
  data = training,
  method = "glm",
  family = "binomial"
)

答案 2 :(得分:0)

我尝试运行逻辑回归模型。我写了这段代码:

// NOTE(review): this C++ snippet uses VTK-style objects and looks unrelated to
// the R/caret discussion around it -- possibly a snippet mix-up; confirm.
// Presumably detaches every clipping plane and every input from the mapper.
vtk_volume_mapper_->RemoveAllClippingPlanes();
vtk_volume_mapper_->RemoveAllInputs();

// Presumably asks the mapper, the volume and the renderer to free the graphics
// resources they hold for vtk_render_window_.
vtk_volume_mapper_->ReleaseGraphicsResources(vtk_render_window_);
vtk_volume_->ReleaseGraphicsResources(vtk_render_window_);
vtk_renderer_->ReleaseGraphicsResources(vtk_render_window_);

对于最后一条命令,我得到了如下输出:

# Install and attach caret.
install.packages("caret")
library(caret)

# Read the data from the user's Documents folder.
setwd("C:\\Users\\BAHOZ\\Documents\\")
# NOTE(review): D is read twice; the second read.csv() overwrites the first,
# so only DataSet.csv ends up being used.
D <- read.csv(file = "D.csv", header = TRUE)
D <- read.csv(file = "DataSet.csv", header = TRUE)
names(D)

# Seeded 70/30 split of the rows, partitioned on the response column X.
set.seed(111134)
Train <- createDataPartition(D$X, p = 0.7, list = FALSE)
training <- D[Train, ]
length(training$age)
testing <- D[-Train, ]
length(testing$age)

# Logistic regression of X on the ten listed predictors, via caret.
mod_fit <- train(
  X ~ age + gender + total.Bilirubin + direct.Bilirubin + total.proteins +
    albumin + A.G.ratio + SGPT + SGOT + Alkphos,
  data = training,
  method = "glm",
  family = "binomial"
)
summary(mod_fit)

# Odds ratios: exponentiated coefficients of the underlying glm fit.
exp(coef(mod_fit$finalModel))

通过运行此命令,我可以预测我的数据,

     (Intercept)              age           gender  total.Bilirubin direct.Bilirubin   total.proteins          albumin        A.G.ratio 
  0.01475027       1.01596886       1.03857883       1.00022899       1.78188072       1.00065332       1.01380334       1.00115742 
        SGPT             SGOT          Alkphos 
  3.93498241       0.05616662      38.29760014 

但是如果我在 predict(mod_fit, newdata = testing) 中设置 type = "prob" 或 type = "raw",

它出错了:

  

执行 predict(mod_fit, newdata = testing, type = "prob") 时报错:Error in dimnames(out)[[2]] <- modelFit$obsLevels : 'dimnames'[2] 的长度不等于数组范围