Building a neural network with one hidden layer produces errors

Time: 2018-07-22 16:57:43

Tags: python tensorflow neural-network deep-learning

I am currently building a 3-4-1 neural network from scratch with numpy (I am avoiding keras and tensorflow so that I can learn and test my own understanding, rather than letting a pre-built library do all the work). When I run the program I see two problems:

  1. After a certain number of iterations the "updated" weights contain "nan" values; lowering the learning rate only delays the problem, it does not solve it.
  2. The prediction accuracy is very low.

I would like to know what is causing these errors in the program, and any help would be appreciated. Here is the code:

# Import our dependencies
from numpy import exp, array, random, dot, ones_like, where

# Create our Artificial Neural Network class
class ArtificialNeuralNetwork():

    # initializing the class
    def __init__(self):

        # generating the same synaptic weights every time the program runs
        random.seed(1)

        # synaptic weights (3 × 4 Matrix) of the hidden layer 
        self.w_ij = 2 * random.rand(3, 4) - 1

        # synaptic weights (4 × 1 Matrix) of the output layer
        self.w_jk = 2 * random.rand(4, 1) - 1

    def LeakyReLU(self, x):

        # The Leaky ReLU (ReLU being short for Rectified Linear Unit) activation function will be applied to the inputs of the hidden layer
        # The activation function will return the same value of x if x is positive
        # while it will multiply the negative values of x by the alpha parameter
        # we used in this example the Leaky ReLU instead of the standard ReLU activation function to avoid the dying ReLU problem

        return where(x > 0, x, x * 0.01)

    def LeakyReLUDerivative(self, x, α = 0.01):

        # The Leaky ReLU Derivative will return 1 for every positive value in the x array 
        # while returning the value of the parameter alpha for every negative value

        x[x > 0] = 1 # returns 1 for every positive value in the x array 

        x[x <= 0] = α # returns α for every negative value in the x array

        return x

    def Sigmoid(self, x):

        # The Sigmoid activation function will turn every input value into probabilities between 0 and 1
        # the probabilistic values help us assert which class x belongs to

        return 1 / (1 + exp(-x))

    def SigmoidDerivative(self, x):

        # The derivative of the Sigmoid activation function will be used to calculate the gradient during the backpropagation process
        # and help optimize the random starting synaptic weights

        return x * (1 - x)

    def train(self, x, y, learning_rate, iterations):

        # x: training set of data
        # y: the actual output of the training data

        for i in range(iterations):

            z_ij = dot(x, self.w_ij) # the dot product of the weights of the hidden layer and the inputs
            a_ij = self.LeakyReLU(z_ij) # using the Leaky ReLU activation function to introduce non-linearity to our Neural Network

            z_jk = dot(a_ij, self.w_jk) # the same process as above is applied to find the input of the output layer
            a_jk = self.Sigmoid(z_jk) # this time the Sigmoid activation function will be used instead of Leaky ReLU 

            dl_jk = -y/a_jk + (1 - y)/(1 - a_jk) # calculating the derivative of the cross entropy loss wrt output
            da_jk = self.SigmoidDerivative(a_jk) # calculating the derivative of the Sigmoid activation function wrt the input (before activation) of the output layer
            dz_jk = a_ij # calculating the derivative of the input (before activation) of the output layer wrt the weights of the output layer

            dl_ij = dot(da_jk * dl_jk, self.w_jk.T) # calculating the derivative of the cross entropy loss wrt the activated output of the hidden layer
                                                    # obtained by multiplying the loss derivative wrt the output by the Sigmoid derivative, then propagating back through the output layer weights
            da_ij = self.LeakyReLUDerivative(z_ij) # calculating the derivative of the Leaky ReLU activation function wrt the inputs of the hidden layer (before activation)
            dz_ij = x # calculating the derivative of the inputs of the hidden layer (before activation) wrt weights of the hidden layer

            # calculating the gradient using the chain rule
            gradient_ij = dot(dz_ij.T , dl_ij * da_ij)
            gradient_jk = dot(dz_jk.T , dl_jk * da_jk)

            # calculating the new optimal weights
            self.w_ij = self.w_ij - learning_rate * gradient_ij 
            self.w_jk = self.w_jk - learning_rate * gradient_jk

    def predict(self, inputs):

        # predicting the class of the input data after weights optimization

        output_from_layer1 = self.LeakyReLU(dot(inputs, self.w_ij)) # the output of the hidden layer

        output_from_layer2 = self.Sigmoid(dot(output_from_layer1, self.w_jk)) # the output of the output layer

        return output_from_layer1, output_from_layer2

    # the function will print the initial starting weights before training
    def SynapticWeights(self):

        print("Layer 1 (4 neurons, each with 3 inputs): ")

        print("w_ij: ", self.w_ij)

        print("Layer 2 (1 neuron, with 4 inputs): ")

        print("w_jk: ", self.w_jk)


def main():

    ANN = ArtificialNeuralNetwork()

    ANN.SynapticWeights()

    # the training inputs 
    x = array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [0, 1, 0], [1, 0, 0], [1, 1, 1], [0, 0, 0]])

    # the training outputs
    y = array([[0, 1, 1, 1, 1, 0, 0]]).T

    ANN.train(x, y, 1, 10000)

    # Printing the new synaptic weights after training
    print("New synaptic weights after training: ")
    print("w_ij: ", ANN.w_ij)
    print("w_jk: ", ANN.w_jk)

    # Our prediction after feeding the ANN with new set of data
    print("Considering new situation [1, 1, 0] -> ?: ")
    print(ANN.predict(array([[1, 1, 0]])))

if __name__=="__main__":
    main()

1 Answer:

Answer 0 (score: 0):

So, I changed a few things. (Disclaimer: I did not check the code for correctness.)

  1. Weight initialization: initialize with smaller weights.

    # synaptic weights (3 × 4 Matrix) of the hidden layer
    self.w_ij = (2 * random.rand(3, 4) - 1)*0.1
    
    # synaptic weights (4 × 1 Matrix) of the output layer
    self.w_jk = (2 * random.rand(4, 1) - 1)*0.1  
    

Weight initialization really matters.
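
If you want an initialization scale that adapts to the layer sizes instead of the hard-coded 0.1 factor, one common alternative (my own sketch, not part of the original answer) is He-style scaling for the Leaky ReLU hidden layer and Xavier-style scaling for the Sigmoid output layer:

    # alternative initialization sketch (would replace the two lines in __init__)
    # assumes: from numpy import random, sqrt

    # He-style init for the hidden layer (Leaky ReLU): normal noise scaled by sqrt(2 / fan_in), fan_in = 3
    self.w_ij = random.randn(3, 4) * sqrt(2.0 / 3)

    # Xavier-style init for the output layer (Sigmoid): normal noise scaled by sqrt(1 / fan_in), fan_in = 4
    self.w_jk = random.randn(4, 1) * sqrt(1.0 / 4)
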

  2. I lowered the learning rate to 0.1.

    ANN.train(x, y, .1, 500000)
    
  3. I see that the neural network fits your data very well, and it does not produce NaN even after 500,000 iterations.

    print(ANN.predict(array([[0, 0, 1],
                             [0, 1, 1],
                             [1, 0, 1],
                             [0, 1, 0],
                             [1, 0, 0],
                             [1, 1, 1],
                             [0, 0, 0]])))
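
To turn those sigmoid outputs into class predictions, you can threshold them at 0.5 and compare them with the training targets. A minimal check (my own addition, assuming it runs inside main() after ANN.train with the same x and y):

    # threshold the sigmoid outputs at 0.5 to obtain class labels (check added here, not in the original answer)
    _, probabilities = ANN.predict(x)
    predictions = (probabilities > 0.5).astype(int)

    # fraction of training examples classified correctly
    print("Training accuracy: ", (predictions == y).mean())
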