I am Japanese and not good at English, so my sentences may be incorrect. Sorry. I wrote some code for logistic regression. I tested this code with a dataset, but it did not work well. That is why I wonder whether there are any bugs in this code. If so, please tell me. Also, I would like to know how to plot the data and the decision boundary.
class logisticr(object):
    def __init__(self, eta=0.01):
        import numpy as np
        from numpy import random
        self.eta = eta

    def sigFunc(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def predict(self, X):
        X = np.matrix(X)
        m, n = X.shape
        X = np.c_[np.matrix(np.ones((m, 1))), X]
        z = X * self.w_
        phi = self.sigFunc(z)
        return self.decide(phi)

    def decide(self, x):
        return np.where(x >= 0.5, 1, 0)

    def costfunc(self, X):
        z = X * self.w_
        phi = self.sigFunc(z)
        J = -y.T * np.log(phi) - (np.ones((m, 1)) - y).T * np.log(np.ones((m, 1)) - phi)
        return J

    def fit(self, X, y):
        X = np.matrix(X).T
        m, n = X.shape
        print "the number of features is %d" % n
        X = np.c_[np.matrix(np.ones((m, 1))), X]
        y = np.matrix(y).T
        self.w_ = np.matrix(np.zeros((n + 1, 1)))
        for xi, yi in zip(X, y):
            zi = xi * self.w_
            phii = self.sigFunc(zi)
            gradJi = -xi.T * (yi - phii)
            self.w_ -= self.eta * gradJi
            self.eta *= 0.1
        print "final parameter is (%d, %d)" % (self.w_[0], self.w_[1])
        z = X * self.w_
        phi = self.sigFunc(z)
        correctAnswer = np.where(np.array(y == self.decide(phi)) == True, 1, 0)
        return float(sum(correctAnswer)) / len(correctAnswer)
Answer:
I modified your code a bit. The fit is now solved in two ways: by simple gradient descent and by a scipy optimizer. Gradient descent is very slow and should not be used for real problems. I tested the class on the dataset from the Coursera Machine Learning course. I am not sure I can share the original dataset, so here is a small part of it:
34.62365962451697,78.0246928153624,0
30.28671076822607,43.89499752400101,0
35.84740876993872,72.90219802708364,0
60.18259938620976,86.30855209546826,1
79.0327360507101,75.3443764369103,1
45.08327747668339,56.3163717815305,0
61.10666453684766,96.51142588489624,1
Here is the output of both solutions:
Simple gradient descent (after 100,000 iterations...):
Number of Features: 3
Found Solution:
[[-4.81180027]
[ 0.04528064]
[ 0.03819149]]
Train Accuracy: 0.910000
Scipy optimizer (very fast):
Number of Features: 3
Found Solution:
[[-25.16131869]
[ 0.20623159]
[ 0.20147149]]
Train Accuracy: 0.890000
The decision boundary in this code only works for this type of problem (a straight line separating two groups). For better accuracy you can try adding combinations of the existing features as new features (watch out for possible overfitting); a sketch of such a feature map follows.
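For example, a quadratic feature map could look like the sketch below. This is only an illustration; the function name map_features and the default degree are my own choices, not part of the code that follows:

import numpy as np

def map_features(X1, X2, degree=2):
    # build all combinations x1^i * x2^j with i + j <= degree,
    # including the column of ones (i = j = 0) as the bias term
    cols = []
    for i in range(degree + 1):
        for j in range(degree + 1 - i):
            cols.append(np.multiply(np.power(X1, i), np.power(X2, j)))
    return np.hstack(cols)

Calling map_features on the two raw feature columns would replace the np.hstack step in the main module below. Note that with such features the boundary becomes a curve, so the straight-line plot() method would no longer apply.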
Here is the full code:
import numpy as np
from numpy import *
import scipy.optimize as op
import matplotlib.pyplot as plt

class Logisticr():
    def __init__(self, X, y, alg, eta, w):
        self.X = X
        self.y = y
        self.alg = alg
        self.eta = eta
        self.w = w
        self.m, self.n = np.shape(X)

    def sigFunc(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def decide(self, x):
        return np.where(x >= 0.5, 1, 0)

    def costfunc(self, w, X, y):
        w = w.reshape((self.n, 1))
        z = X * w
        phi = self.sigFunc(z)
        # calculating the cost function
        part1 = np.multiply(y, np.log(phi))
        part2 = np.multiply((1 - y), np.log(1 - phi))
        J = (-part1 - part2).sum() / self.m
        # calculating the gradient
        grad = X.T * (phi - y) / self.m
        return J, grad

    def graddescent(self, maxiter):
        for i in range(0, maxiter):
            J, grad = self.costfunc(self.w, self.X, self.y)
            self.w = self.w - self.eta * grad
        return self.w

    def fit(self):
        print "Number of Features: %d" % self.n
        if self.alg == 0:
            _maxiter = 100000
            self.w = self.graddescent(_maxiter)
        else:
            Result = op.minimize(fun=self.costfunc,
                                 x0=self.w,
                                 args=(self.X, self.y),
                                 method='TNC',
                                 jac=True)
            self.w = Result.x
            self.w = np.matrix(self.w).T
        print "Found Solution:"
        print self.w
        z = self.X * self.w
        phi = self.sigFunc(z)
        correctAnswer = np.where(np.array(self.y == self.decide(phi)) == True, 1, 0)
        accuracy = float(sum(correctAnswer)) / len(correctAnswer)
        print "Train Accuracy: %f" % accuracy

    def plot(self):
        if self.n == 3:
            ind_1 = np.where(self.y == 1)
            ind_0 = np.where(self.y == 0)
            # x1 range of the boundary line
            x1_1 = self.X[:, [1]].min()
            x1_2 = self.X[:, [1]].max()
            # solve w0 + w1*x1 + w2*x2 = 0 for x2 at both ends
            x2_1 = -(self.w[0, 0] + self.w[1, 0] * x1_1) / self.w[2, 0]
            x2_2 = -(self.w[0, 0] + self.w[1, 0] * x1_2) / self.w[2, 0]
            plt.plot(self.X[ind_1, [1]], self.X[ind_1, [2]], "bo", markersize=3)
            plt.plot(self.X[ind_0, [1]], self.X[ind_0, [2]], "ro", markersize=3)
            plt.plot([x1_1, x1_2], [x2_1, x2_2], "g-")
            plt.xlabel("Feature 1")
            plt.ylabel("Feature 2")
            plt.title("Decision boundary")
            plt.show()
        return 1

# Main module
_f = loadtxt('data/ex2data1.txt', delimiter=',')
_X, _y = _f[:, [0, 1]], _f[:, [2]]
_m = np.shape(_X)[0]
# add a column of 1
_X = np.hstack((np.matrix(np.ones((_m, 1))), _X))
_y = np.matrix(_y)
_alg = 1     # 0 = using simple gradient descent
             # 1 = using an optimizer
_eta = 0.001 # initial eta value for _alg = 0
_n = np.shape(_X)[1]
_w = np.matrix(np.zeros((_n, 1)))
# creating an instance of the Logisticr class
lr = Logisticr(_X, _y, _alg, _eta, _w)
lr.fit()
lr.plot()
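A quick way to check that the analytic gradient returned by costfunc matches the cost is a finite-difference test. The sketch below reuses the lr instance created above; the step size eps is an arbitrary choice:

eps = 1e-4
w0 = np.matrix(np.zeros((_n, 1)))
J0, grad0 = lr.costfunc(np.asarray(w0).ravel(), _X, _y)
for k in range(_n):
    w_plus, w_minus = w0.copy(), w0.copy()
    w_plus[k, 0] += eps
    w_minus[k, 0] -= eps
    J_plus, _ = lr.costfunc(np.asarray(w_plus).ravel(), _X, _y)
    J_minus, _ = lr.costfunc(np.asarray(w_minus).ravel(), _X, _y)
    # central difference approximation of dJ/dw_k
    numeric = (J_plus - J_minus) / (2.0 * eps)
    print "w%d: analytic %f, numeric %f" % (k, grad0[k, 0], numeric)

The two columns should agree to several decimal places if the gradient is implemented correctly.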
Update:
I tried the dataset you referenced. It looks like the classes are not separable using only temperature and humidity. Even if you use higher-order feature combinations, you will probably just overfit the classifier. Don't you think?
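If you do experiment with higher-order features, the usual guard against overfitting is L2 regularization. The sketch below shows one way the costfunc method above could be extended; the lam parameter and the convention of not penalizing the bias weight w[0] are standard choices, not something from the original code:

def costfunc(self, w, X, y, lam=1.0):
    w = w.reshape((self.n, 1))
    z = X * w
    phi = self.sigFunc(z)
    part1 = np.multiply(y, np.log(phi))
    part2 = np.multiply((1 - y), np.log(1 - phi))
    # penalize large weights, skipping the bias term w[0]
    reg = lam * np.power(w[1:], 2).sum() / (2.0 * self.m)
    J = (-part1 - part2).sum() / self.m + reg
    grad = X.T * (phi - y) / self.m
    grad[1:] += lam * w[1:] / self.m
    return J, grad

Larger lam values give a smoother boundary; lam = 0 recovers the unregularized cost.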