Neural network with 'tanh' activation and 'cross-entropy' cost function does not work

Time: 2016-07-05 09:18:52

Tags: python machine-learning neural-network

I implemented a simple neural network. It works with 'sigmoid + cross-entropy', 'sigmoid + quadratic cost', and 'tanh + quadratic cost', but it does not work with 'tanh + cross-entropy' (it does no better than random guessing). Can anyone help me figure out why? Just look at the code of FullConnectedLayer:

class FullConnectedLayer(BaseLayer):
    """
    FullConnectedLayer
    ~~~~~~~~~~~~~~~~~~~~
    Data members: 
    sizes       ---- <type list> sizes of the network
    n_layers    ---- <type int> number of sublayers
    activation  ---- <type Activation> activation function for neurons
    weights     ---- <type list> to store weights
    biases      ---- <type list> to store biases
    neurons     ---- <type list> to store states (outputs) of neurons
    zs          ---- <type list> to store weighted inputs to neurons
    grad_w      ---- <type list> to store gradient of Cost w.r.t weights
    grad_b      ---- <type list> to store gradient of Cost w.r.t biases
    ---------------------
    Methods:
    __init__(self, sizes, activation = Sigmoid())
    size(self)
    model(self)
    feedforward(self, a)
    backprop(self, C_p)
    update(self, eta, lmbda, batch_size, n)
    """

    def __init__(self, sizes, activation = Sigmoid(), normal_initialization = False):
        """
        The list ''sizes'' contains the number of neurons in the respective
        layers of the network. For example, sizes = [2, 3, 2] represents 3
        layers, with the first layer having 2 neurons, the second 3 neurons,
        and the third 2 neurons.

        Note that the input layer may be fed by another layer of a different
        type connected before this one, and we don't set biases for the input
        layer. Also note that the output layer may feed another layer
        connected after this one; in that case, just assign its outputs to
        that layer's inputs.
        For example, Layer1([3, 2, 4])->Layer2([4, 6, 3])->Layer3([3, 2]):
        just assign the output of Layer1 to the input of Layer2 and it will
        be safe.
        """

        BaseLayer.__init__(self, sizes, activation)

        if normal_initialization:
            self.weights = [np.random.randn(j, i)
                    for i, j in zip(sizes[:-1], sizes[1:])]
        else:
            self.weights = [np.random.randn(j, i) / np.sqrt(i)
                    for i, j in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(j, 1) for j in sizes[1:]]

        self.grad_w = [np.zeros(w.shape) for w in self.weights]
        self.grad_b = [np.zeros(b.shape) for b in self.biases]

    def feedforward(self, a):
        """
        Return output of the network if ''a'' is input.
        """
        self.neurons = [a] # to store activations (outputs) of all layers
        self.zs = []
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, self.neurons[-1]) + b
            self.zs.append(z)
            self.neurons.append(self.activation.func(z))
        return self.neurons[-1]


    def backprop(self, Cp_a):
        """
        Backpropagate the delta error.
        ------------------------------
        Return a tuple whose first component is a list of the gradients of 
        weights and biases, whose second component is the backpropagated delta.
        Cp_a, dC/da: derivative of cost function w.r.t a, output of neurons. 
        """
        # The last layer
        delta = Cp_a * self.activation.prime(self.zs[-1])
        self.grad_b[-1] += delta
        self.grad_w[-1] += np.dot(delta, self.neurons[-2].transpose()) 

        for l in range(2, self.n_layers):
            sp = self.activation.prime(self.zs[-l])  # a.prime(z)
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp  
            self.grad_b[-l] += delta
            self.grad_w[-l] += np.dot(delta, self.neurons[-l - 1].transpose())

        Cp_a_out = np.dot(self.weights[0].transpose(), delta)

        return Cp_a_out

    def update(self, eta, lmbda, batch_size, n):
        """
        Update the network's weights and biases by applying gradient descent
        algorithm.
        ''eta'' is the learning rate
        ''lmbda'' is the regularization parameter
        ''n'' is the total size of the training data set
        """
        self.weights = [(1 - eta * (lmbda/n)) * w - (eta/batch_size) * delta_w\
                for w, delta_w in zip(self.weights, self.grad_w)]
        self.biases = [ b - (eta / batch_size) * delta_b\
                for b, delta_b in zip(self.biases, self.grad_b)]

        # Clear ''grad_w'' and ''grad_b'' so that they are not added to the 
        # next update pass
        for dw, db in zip(self.grad_w, self.grad_b):
            dw.fill(0)
            db.fill(0)
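
For reference, a minimal sketch of how these methods are meant to be called for one mini-batch (train_mini_batch is a simplified stand-in for my actual training loop):

    def train_mini_batch(layer, batch, cost, eta, lmbda, n):
        """
        ''batch'' is a list of (x, y) column-vector pairs, ''cost'' is a Cost
        class such as CrossEntropyCost, ''n'' is the total training-set size.
        """
        for x, y in batch:
            a = layer.feedforward(x)          # forward pass, caches zs and neurons
            layer.backprop(cost.Cp_a(a, y))   # accumulates grad_w and grad_b
        layer.update(eta, lmbda, len(batch), n)  # averaged update, then gradients are cleared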

Here is the code for the tanh function:

class Tanh(Activation):

    @staticmethod
    def func(z):
        """ The functionality. """
        return (np.exp(z) - np.exp(-z)) / (np.exp(z) + np.exp(-z))

    @staticmethod
    def prime(z):
        """ The derivative. """
        return 1. - Tanh.func(z) ** 2
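
(As an aside: the exp-based formula above can overflow for large |z|. A numerically safer, equivalent sketch using np.tanh, with a hypothetical class name, would be:)

    class StableTanh(Activation):

        @staticmethod
        def func(z):
            """ The functionality; np.tanh avoids overflow in np.exp for large |z|. """
            return np.tanh(z)

        @staticmethod
        def prime(z):
            """ The derivative: 1 - tanh(z)^2. """
            return 1. - np.tanh(z) ** 2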

Here is the code for the cross-entropy class:

class CrossEntropyCost(Cost):

    @staticmethod
    def func(a, y):
        """
        Return the cost associated with an output ''a'' and desired output
        ''y''. 
        Note that np.nan_to_num is used to ensure numerical stability. In
        particular, if both ''a'' and ''y'' have a 1.0 in the same slot, 
        then the expression (1-y) * np.log(1-a) returns nan. The np.nan_to_num
        ensures that that is converted to the correct value (0.0).
        """
        for ai in a:
            if ai < 0:
                print("in CrossEntropyCost.func(a, y)... require a_i > 0, a_i belong to a.")
                exit(1)

        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))

    @staticmethod
    def Cp_a(a, y):
        """
        Cp_a, dC/da: the derivative of C w.r.t a
        ''a'' is the output of neurons
        ''y'' is the expected output of neurons
        """
        #return (a - y) # delta
        return (a - y) / (a * (1 - a))

EDIT: It seems the problem is that tanh's range is [-1, +1], which is invalid for cross-entropy. But if I still want tanh activation together with a cross-entropy cost, how should I handle this?
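
Concretely (if my reasoning is right): the division by a * (1 - a) in CrossEntropyCost.Cp_a is exactly the sigmoid derivative, so with sigmoid the two factors cancel during backprop:

    delta = Cp_a(a, y) * sigmoid'(z) = (a - y) / (a * (1 - a)) * a * (1 - a) = a - y

With tanh the derivative is 1 - a^2 instead, nothing cancels, and a can be zero or negative, so log(a) in func and the division by a * (1 - a) in Cp_a break down.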

3 Answers:

Answer 0 (score: 3)

Late answer, but I think it is worth mentioning.

If you want to use a tanh activation function, then instead of using the standard cross-entropy cost function, you can modify it to handle outputs between -1 and 1.

The modified cost would look something like:

    ((1 + y)/2 * log(a)) + ((1 - y)/2 * log(1 - a))

Using this as the cost function will let you use the tanh activation.
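
A minimal sketch along these lines, fitting the Cost interface from the question (the class name TanhCrossEntropyCost is hypothetical; it also maps the tanh output a into (0, 1) via (1 + a)/2 so the logarithms stay defined for negative a, and it assumes the targets y are recoded to {-1, +1}):

    class TanhCrossEntropyCost(Cost):

        @staticmethod
        def func(a, y):
            """
            Cross-entropy adapted to tanh outputs ''a'' in (-1, 1) and targets
            ''y'' in {-1, +1}: both are mapped into [0, 1] before taking logs.
            """
            p = (1 + a) / 2.0
            q = (1 + y) / 2.0
            return np.sum(np.nan_to_num(-q * np.log(p) - (1 - q) * np.log(1 - p)))

        @staticmethod
        def Cp_a(a, y):
            """
            dC/da = (a - y) / (1 - a^2). In backprop this gets multiplied by
            tanh'(z) = 1 - a^2, so delta reduces to (a - y), just as with
            sigmoid + cross-entropy.
            """
            return (a - y) / (1. - a ** 2)

The targets then have to be recoded from {0, 1} to {-1, +1} before training.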

Answer 1 (score: 2)

Your output layer appears to be using tanh, whose range is [-1, +1], while the expected outputs are in the range [0, +1]. This is not a problem for sigmoid, which does produce outputs in the [0, +1] range.

Answer 2 (score: 1)

Cross-entropy expects its inputs to be probabilities in the range 0 to 1, but tanh maps its inputs to values in the range -1 to 1, which cross-entropy cannot handle.

One possible fix is to rescale the output of the final tanh layer into the [0, 1] range before feeding it into the cross-entropy cost.
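
A minimal sketch of that rescaling, reusing the classes from the question (tanh_xent_step is a hypothetical helper; the 0.5 factor is the chain-rule term d((a + 1)/2)/da):

    def tanh_xent_step(layer, x, y):
        """
        Hypothetical helper: one forward/backward pass for a tanh output layer
        combined with the unmodified CrossEntropyCost, rescaling the output.
        """
        a = layer.feedforward(x)                   # tanh output, in (-1, 1)
        p = (a + 1.0) / 2.0                        # rescaled into (0, 1); targets y stay in {0, 1}
        cost = CrossEntropyCost.func(p, y)
        Cp_a = CrossEntropyCost.Cp_a(p, y) * 0.5   # chain rule: dp/da = 1/2
        layer.backprop(Cp_a)                       # then layer.update(...) as usual
        return cost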