I feel like my intuition for backpropagation is still shaky, so I wrote a neural network class to train on and predict XOR. It has 2 inputs, 1 output, a configurable number of hidden nodes, and bias nodes for both the hidden and output layers.
What I've noticed is that when I drop the number of hidden nodes from 3 to 2, the network can't seem to solve the problem with the sigmoid activation, no matter how many epochs or what learning rate I use. But if I switch the activation to tanh, it learns with just 2 hidden nodes.
I can think of three possible explanations:
1. XOR cannot be learned with a single hidden layer of only 2 nodes (this seems unlikely, since I've seen plenty of diagrams of exactly such a network; see the hand-wired sanity check right after this list)
2. XOR cannot be learned with the combination of a sigmoid activation and a single hidden layer of only 2 nodes
3. There is a bug in my code, or something else I'm not accounting for
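On the first point, a quick hand-wired sanity check suggests that a 2-2-1 sigmoid network with biases can at least represent XOR. The weights below are values I picked by hand so that the two hidden units behave roughly like OR and NAND and the output unit like AND; it says nothing about whether gradient descent will actually find such weights:

import numpy as np

sigmoid = lambda x: 1 / (1 + np.exp(-x))

# hand-picked weights: hidden unit 0 ~ OR, hidden unit 1 ~ NAND, output unit ~ AND
wih = np.array([[ 20.0,  20.0],    # OR-ish unit
                [-20.0, -20.0]])   # NAND-ish unit
bh  = np.array([[-10.0], [30.0]])
who = np.array([[20.0, 20.0]])
bo  = np.array([[-30.0]])

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T   # columns are samples, same layout as my class
h = sigmoid(np.dot(wih, X) + bh)
o = sigmoid(np.dot(who, h) + bo)
print(np.round(o, 3))   # approximately [0, 1, 1, 0]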
Here is the code:
import numpy as np

np.random.seed(42)

class NeuralNetwork:
    def __init__(self, input_nodes, hidden_nodes, output_nodes,
                 lr, activation='sigmoid'):
        self.epoch_count = 0
        self.activation = activation
        self.sigmoid = lambda x: 1 / (1+np.exp(-x))
        if activation == 'tanh': self.nonlinearity = lambda x: np.tanh(x)
        elif activation == 'relu': self.nonlinearity = lambda x: x * (x > 0)
        else: self.nonlinearity = self.sigmoid
        self.inodes = input_nodes
        self.hnodes = hidden_nodes
        self.onodes = output_nodes
        self.wih = np.random.normal(0.0, self.xavier_he_init_std(self.inodes,self.hnodes), (self.hnodes, self.inodes))
        self.who = np.random.normal(0.0, self.xavier_he_init_std(self.inodes,self.hnodes), (self.onodes, self.hnodes))
        self.bo = np.zeros((self.onodes,1))
        self.bh = np.zeros((self.hnodes,1))
        self.lr = lr

    def xavier_he_init_std(self,n_in,n_out):
        if self.activation == 'tanh':
            return np.sqrt(4)*np.sqrt(2/(n_in+n_out))
        elif self.activation == 'relu':
            return np.sqrt(2)*np.sqrt(2/(n_in+n_out))
        else: # sigmoid
            return np.sqrt(2/(n_in+n_out))
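        # (For reference, the textbook forms these are based on: Glorot/Xavier normal
        #  init uses std = sqrt(2/(n_in+n_out)) and He normal init uses std = sqrt(2/n_in);
        #  the sqrt(4)/sqrt(2) gains above are my own choices.)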
    def predict(self,X):
        self.forward(X)
        bin_thresh = 0.5
        probs = self.final_outputs.T
        class_predictions = [1 if i > bin_thresh else 0 for i in probs]
        return class_predictions

    def fit(self, X, y, verbose=True):
        self.forward(X)
        self.backward(y)
        self.epoch_count += 1
        if verbose and (self.epoch_count%100==0):
            print("epoch %4d mse loss:" % self.epoch_count, self.mse_loss)

    def forward(self, X, output=False):
        self.X = np.array(X).T
        hidden_inputs = np.dot(self.wih, self.X) + self.bh # broadcast biases
        self.hidden_outputs = self.nonlinearity(hidden_inputs)
        final_inputs = np.dot(self.who, self.hidden_outputs) + self.bo
        self.final_outputs = self.sigmoid(final_inputs)
        if output: return self.final_outputs

    def backward(self, y):
        y = np.array(y).T
        self.mse_loss = np.square((y-self.final_outputs)).mean()
        # CHAIN RULE FOR WEIGHT UPDATES
        # Hidden-Output layer weights update
        # =================================================================
        # Need to calculate the partial derivative of the loss with respect
        # to each weight in the layer Who
        # partial derivative of the error with respect to the output ∂E/∂O:
        # E = (1/n)*sum((target-actual)^2) -> ∂E/∂O = -(2/n)*(target-actual)
        # (the constant 1/n is dropped below)
        dEdO_Who = -2*(y-self.final_outputs)
        # partial derivative of the output with respect to the net input ∂O/∂N:
        # 1 / (1 + e^(-N)) -> sigmoid(N)*(1-sigmoid(N)) -> O*(1-O)
        dOdN_Who = self.final_outputs * (1-self.final_outputs)
        # partial derivative of the net inputs with respect to the weights ∂N/∂W:
        # Net_ho = W1*X1 + W2*X2 + ... -> X1, X2, ...
        dNdW_Who = self.hidden_outputs
        # multiplying each part of the chain rule together:
        dEdW_Who = np.dot((dEdO_Who * dOdN_Who), dNdW_Who.T) # dot b.c. of batch inputs
        # update the weights by multiplying by the learning rate
        nabla_Who = self.lr * dEdW_Who
        nabla_bo = self.lr * np.dot((dEdO_Who * dOdN_Who), np.expand_dims(np.ones(y.shape),axis=1))
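        # (Note: the dot product with the ones vector above just sums the output-layer
        #  deltas over the batch; if I have the shapes right it is equivalent to
        #  np.sum(dEdO_Who * dOdN_Who, axis=1, keepdims=True).)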
        # Input-Hidden layer weights update: backpropagating errors
        # =================================================================
        # Need to calculate the partial derivative of the loss with respect
        # to each weight in the layer wih
        # similar but slightly different process because the output of each hidden
        # layer neuron contributes to the output/error of multiple output neurons
        # For example, ∂E/∂Wih_1 = sum_over_outputs(∂E/∂out_o * ∂out_o/∂net_o * ∂net_o/∂out_h)
        #                          * ∂out_h/∂net_h * ∂net_h/∂Wih_1
        # This equals sum(δ_o * Who) * out_h * (1 - out_h) * i_1
        # where δ_o = ∂E/∂net_o = ∂E/∂out_o * ∂out_o/∂net_o
        # the output error is split back across the hidden layer in proportion to the connecting weights
        dEdO_Wih = np.dot(self.who.T, dEdO_Who) # first three p.d. terms
        dOdN_Wih = self.hidden_outputs * (1-self.hidden_outputs)
        dNdW_Wih = self.X # inputs
        dEdW_Wih = np.dot((dEdO_Wih * dOdN_Wih), dNdW_Wih.T)
        nabla_Wih = self.lr * dEdW_Wih
        nabla_bh = self.lr * np.dot((dEdO_Wih * dOdN_Wih), np.expand_dims(np.ones(y.shape),axis=1))
        # update the weights and biases
        self.who -= nabla_Who
        self.wih -= nabla_Wih
        self.bo -= nabla_bo
        self.bh -= nabla_bh

def main():
    X_train = [
        (0, 0),
        (0, 1),
        (1, 0),
        (1, 1)
    ]
    y_train = [0,1,1,0]
    sz = lambda l: 1 if (isinstance(l[0], int) or isinstance(l[0], float)) else len(l[0])
    input_nodes = sz(X_train)
    output_nodes = sz(y_train)
    hidden_nodes = 2
    epochs = 500
    lr = 0.1
    nn = NeuralNetwork(input_nodes,hidden_nodes,output_nodes,lr,activation='tanh')
    # train
    for _ in range(epochs):
        nn.fit(X_train, y_train, verbose=True)
    # predict
    print(nn.predict(X_train))

if __name__ == "__main__":
    main()
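For the third possibility, a finite-difference gradient check along these lines should show whether the analytic gradients are off. This is only a rough sketch that reuses the NeuralNetwork class above; check_grad_wih00 is a helper name I made up, and it only probes the single weight wih[0, 0]:

def check_grad_wih00(nn, X, y, eps=1e-5):
    # numerically estimate dE/d wih[0,0] with central differences
    def loss():
        out = nn.forward(X, output=True)
        return np.square(np.array(y).T - out).mean()
    orig = nn.wih[0, 0]
    nn.wih[0, 0] = orig + eps
    loss_plus = loss()
    nn.wih[0, 0] = orig - eps
    loss_minus = loss()
    nn.wih[0, 0] = orig  # restore the weight
    return (loss_plus - loss_minus) / (2 * eps)

The idea would be to compare this number against the corresponding entry of the analytic dEdW_Wih (for example by temporarily returning it from backward()); since the code drops the constant 1/n from the MSE derivative, any comparison has to account for that factor.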
Can someone please explain what is going on with 2 hidden nodes + sigmoid?