Question

我已经花了几个小时，但是我看不到它。我知道错误在于反向传播部分，因为当我用Andrew Ng的代码替换它时，它就可以工作。但是我无法将他的代码（来自here）与我从3blue1brown（来自here）所遵循的数学（对我而言，这很有意义）联系起来。

问题是我为权重矩阵的导数（dLdW1，dLdW2）获得的形状没有意义。 W1的形状为（4,2），因此dLdW1也应该为（而是2,1）。 W2的形状为（1,4），因此dLdW2也应该为（而是4,400）。因此，我正确地选择了其中一个尺寸，但另一个尺寸却不正确，我不知道为什么。

关于符号的注释：在我写例如dW2的地方，我指的是W2到前一个直项的导数，在这种情况下是Z2。在写dLdW2的地方，我指的是针对成本函数的最终导数（即，整个网络的完整反向支持）。

import numpy as np
np.random.seed(1) 

from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets
X, Y = load_planar_dataset()

def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))

def define_NN_structure(X,Y):
    n_x = X.shape[0] #input neurons
    n_h = 4 #hidden neurons
    n_y = Y.shape[0] #output neurons
    return n_x, n_h, n_y

def initialize_params(n_x,n_h,n_y):
    W1 = np.random.randn(n_h,n_x)*0.01 #n[L] by n[L-1]
    B1 = np.zeros((n_h,1)) #n[L] by 1 
    W2 = np.random.randn(n_y,n_h)*0.01 #n[L] by n[L-1]
    B2 = np.zeros((n_y,1)) #n[L] by 1 
    return W1, B1, W2, B2

def forward_prop(W1, B1, W2, B2):
    Z1 = W1 @ X + B1
    A1 = sigmoid(Z1)
    Z2 = W2 @ A1 + B2
    A2 = sigmoid(Z2)
    return Z1, A1, Z2, A2

def compute_cost(A2, Y):
    m = Y.shape[1]
    L = 1/m * np.sum((A2 - Y)**2) #MSE
    return L

def backward_prop(X, Y, Z1, A1, Z2, A2):
    m = Y.shape[1]
    dA2 = 2*np.abs(A2 - Y) #1,400
    dZ2 = sigmoid_prime(Z2) #1,400
    dW2 = A1 #4,400
    dB2 = np.ones((1,A1.shape[1]))#1,400
    dA1 = W2 #1,4
    dZ1 = sigmoid_prime(Z1) #4,400
    dW1 = X #2,400
    dB1 = np.ones((1,X.shape[1])) #1,400

    dLdW2 = 1/m * dW2 @ dZ2.T @ dA2
    dLdB2 = 1/m * dB2 @ dZ2.T @ dA2
    dLdW1 = 1/m * dW1 @ dZ1.T @ dA1.T @ dZ2 @ dA2.T
    dLdB1 = 1/m * dB1 @ dZ1.T @ dA1.T @ dZ2 @ dA2.T
    return dLdW2, dLdB2, dLdW1, dLdB1, dA2, dZ2, dW2, dB2, dA1, dZ1, dW1, dB1

def update_params(W1, B1, W2, B2, dLdW2, dLdB2, dLdW1, dLdB1, alpha):
    W1 = W1 - alpha * dLdW1
    B1 = B1 - alpha * dLdB1
    W2 = W2 - alpha * dLdW2
    B2 = B2 - alpha * dLdB2    
    return W1, B1, W2, B2 

n_x, n_h, n_y = define_NN_structure(X,Y)
W1, B1, W2, B2 = initialize_params(n_x, n_h, n_y)

for i in range(3000):
    Z1, A1, Z2, A2 = forward_prop(W1, B1, W2, B2)
    L = compute_cost(A2, Y)
    if i % 100 == 0:
        print('Loss is: ' + str(L))
    dLdW2, dLdB2, dLdW1, dLdB1, dA2, dZ2, dW2, dB2, dA1, dZ1, dW1, dB1 = backward_prop(X, Y, Z1, A1, Z2, A2)
    W1, B1, W2, B2 = update_params(W1, B1, W2, B2, dLdW2, dLdB2, dLdW1, dLdB1, 0.2)

Answer 1

更多时间后，我发现了。这是反向传播的正确方程组：

    m = Y.shape[1]
    dA2 = 1/m * 2*(A2 - Y) #1,400
    dZ2 = sigmoid_prime(Z2) #1,400
    dW2 = A1 #4,400
    dB2 = np.ones((1,A1.shape[1]))#1,400
    dA1 = W2 #1,4
    dZ1 = sigmoid_prime(Z1) #4,400
    dW1 = X #2,400
    dB1 = np.ones((1,X.shape[1])) #1,400

    delta2 = dA2 * dZ2 #1,400
    dLdW2 = delta2 @ dW2.T #1,4
    dLdB2 = delta2 @ dB2.T #1,1

    delta1 = delta2.T @ dA1 * dZ1.T #400,4
    dLdW1 = delta1.T @ dW1.T #4,2
    dLdB1 = delta1.T @ dB1.T #4,1

在我的原始方法中有两件事是错误的： 1.证明某些术语需要元素乘而不是矩阵乘（特别是dZ项） 2.条款顺序错误。我确定您可以通过足够的移调来保留原始顺序，但是一旦我理解该怎么做，这就是最简单的方法。

This video很有帮助。

在我的神经网络中的反向传播算法中找不到错误

1 个答案: