I am running logistic regression in R with glm on my data, but glm reports the perfect-separation warnings:

1: glm.fit: algorithm did not converge
2: glm.fit: fitted probabilities numerically 0 or 1 occurred

However, when I code the logistic regression myself and optimize it with optim() (method = "BFGS"), no warnings are returned and everything seems fine, yet the predicted probabilities are moderate and quite different from glm's, which come out as roughly 10^-12 or 1. Why does this happen?
Link to the dataset: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/downloads/breast-cancer-wisconsin-data.zip
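Just so it is clear what I mean by the warnings, here is a minimal sketch on toy data (not my dataset) that typically reproduces them with a perfectly separated predictor:

#toy, perfectly separated data: every x < 0 is class 0, every x > 0 is class 1
x <- c(-3, -2, -1, 1, 2, 3)
y <- c( 0,  0,  0, 1, 1, 1)
fit <- glm(y ~ x, family = binomial)
#typically warns:
# glm.fit: algorithm did not converge
# glm.fit: fitted probabilities numerically 0 or 1 occurred
coef(fit)    #huge slope, because separation pushes the estimate towards +/-Inf
fitted(fit)  #fitted probabilities numerically 0 or 1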
Here is what I have done so far; with glm, the accuracy on the test set is 100%.
#importing data manipulation library
library(dplyr)
#reading the Kaggle csv (data.csv); the id column and the empty trailing column
#are assumed to be dropped, so column 1 is diagnosis and columns 2:31 are the features
cancer <- read.csv("data.csv")
cancer <- cancer[, setdiff(names(cancer), c("id", "X"))]
#forming train and test datasets by a 70-30 ratio
train = sample_frac(cancer, 0.7)
sid <- as.numeric(rownames(train))
test <- cancer[-sid, ]
#the instance matrix and response matrix and the intercepts
x0 = matrix(1, nrow = nrow(train), ncol = 1)
x1 = matrix(1, nrow = nrow(test), ncol = 1)
trainX = as.matrix(cbind(x0,train[2:31]))
testX = as.matrix(cbind(x1,test[2:31]))
trainY = as.matrix(train[1])
trainY = as.factor(trainY)
#setting levels to zero and one for trainY
trainY <- ifelse(trainY=="M", 1, 0)
#same for testY
testY = as.matrix(test[1])
testY = as.factor(testY)
testY <- ifelse(testY=="M", 1, 0)
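(A quick sanity check that the M/B labels were recoded as intended, purely illustrative:)

#class counts after recoding: 1 = malignant (M), 0 = benign (B)
table(trainY)
table(testY)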
#initializing some inputs and the logistic function
m = nrow(trainX)
Theta = matrix(0, nrow = ncol(trainX), ncol = 1)
#computing the cost function and its gradient to be optimized
costfunction = function(Thetaf,trainX,trainY){
  m = nrow(trainX)
  #logistic (sigmoid) probabilities for the current parameters
  s = 1/(1+exp(-(trainX%*%Thetaf)))
  #negative log-likelihood (cross-entropy) cost, unregularized
  J = -(1/m)*(t(trainY)%*%log(s) + t(1-trainY)%*%log(1-s))
  #gradient of the cost with respect to Thetaf
  grad = (1/m)*(t(trainX)%*%(s-trainY))
  list(J = J, grad = grad)
}
#calling the cost function at the initial parameters
cost = costfunction(Theta,trainX,trainY)
#saving the output
initJ = cost$J
initgrads = cost$grad
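To check that the analytic gradient matches the cost, a central finite-difference comparison can be used (a quick sketch, reusing costfunction, Theta and initgrads from above):

#numerical gradient (central differences) compared against the analytic gradient
numgrad <- numeric(length(Theta))
eps <- 1e-5
for (i in seq_along(Theta)) {
  tp <- Theta; tp[i] <- tp[i] + eps
  tm <- Theta; tm[i] <- tm[i] - eps
  numgrad[i] <- (costfunction(tp, trainX, trainY)$J -
                 costfunction(tm, trainX, trainY)$J) / (2 * eps)
}
max(abs(numgrad - initgrads))   #should be close to 0 if the gradient is right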
#optimizing using optim
res <- optim(par = Theta,
fn = function(t) costfunction(t,trainX,trainY)$J,
gr = function(t) costfunction(t,trainX,trainY)$grad,
method = "BFGS", control = list(maxit = 400))
#saving outputs
costfinal = res$value
theta = res$par
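optim also reports whether BFGS thinks it converged, which can be inspected from the returned list:

#0 means successful convergence, 1 means the maxit limit was reached
res$convergence
#number of calls to the cost function and to the gradient
res$counts
#final (minimized) value of the cost
res$value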
#prediction time
testprobs = 1/(1+exp(-(testX%*%theta)))
#testprobs output
[1] 0.999999046 0.970736334 0.009962557 0.047821537 0.977419388
[6] 0.969593889 0.047356574 0.852648764 0.028448420 0.016347660
[11] 0.006836505
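For completeness, the test-set accuracy of the optim-based model can be computed by thresholding the probabilities at 0.5 (sketch, reusing testprobs and testY from above):

#class predictions from the probabilities, thresholded at 0.5
testpred = ifelse(testprobs > 0.5, 1, 0)
#confusion table and accuracy on the test set
table(predicted = testpred, actual = testY)
mean(testpred == testY)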
#trying out glm
glm.fit = glm(trainY~.,data=train, family = 'binomial')
glm.probs = predict(glm.fit,test,type = 'response')
#glm output probabilities
glm.probs[20:30]
1.000000e+00 1.000000e+00 2.900701e-12 2.900702e-12 1.000000e+00
1.000000e+00 2.900702e-12 1.000000e+00 2.900702e-12 2.900702e-12
2.900702e-12
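In case it helps, comparing the coefficient magnitudes of the two fits shows how different the solutions are (sketch, reusing glm.fit and res from above):

#under separation glm's estimates keep growing towards +/-Inf,
#while the hand-rolled BFGS fit stops at moderate values
range(abs(coef(glm.fit)), na.rm = TRUE)
range(abs(as.vector(res$par)))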