I'm having trouble with the dimensions in my deep neural network, which raises this error:
line 104, in linear_backward
dW = (dz @ a.T) * W * lambd / m
ValueError: operands could not be broadcast together with shapes (8,8) (8,32)
The problem seems clearly to be in back-propagation, but I can't find my mistake. I apologise in advance for the messy code, and I'd be very grateful for any help, as I'm a high-school student with no one around to ask.
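For reference, the standard L2-regularised weight gradient (sketched here under the usual shape conventions and with generic names, not taken from my code) adds the regularisation term rather than multiplying by W:
# dz: (n_l, m), a_prev: (n_(l-1), m), W: (n_l, n_(l-1)), so dz @ a_prev.T matches W's shape
dW = (dz @ a_prev.T) / m + (lambd / m) * W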
The program is below:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Neural Network Build
def sigmoid(Z):
# activation function used to classify the output
# range of the function is between 0 and 1.
a = (1 + np.exp(-Z)) ** -1
return a
def relu(Z):
# activation function used in hidden layers
A = np.maximum(0, Z) # outputs Z if Z is positive, otherwise returns 0
assert (A.shape == Z.shape) #stops program if this isn't true
return A
def relu_back(dA, Z):
# function finding the derivative of the relu function
dZ = np.array(dA, copy=True)
dZ[Z <= 0] = 0
return dZ
def sigmoid_back(dA, Z):
# function finding the derivative of the sigmoid function
s = sigmoid(Z)
dZ = dA * s * (1 - s)
return dZ
def initialize_parameters(layer_dims):
parameters = {}
L = len(layer_dims)
for layer in range(1, L):
parameters["W" + str(layer)] = np.random.randn(layer_dims[layer], layer_dims[layer - 1]) * np.sqrt(
2 / layer_dims[layer - 1])
parameters["b" + str(layer)] = np.zeros((layer_dims[layer], 1))
print(parameters["W" + str(layer)].shape)
assert (parameters['W' + str(layer)].shape == (layer_dims[layer], layer_dims[layer - 1]))
assert (parameters['b' + str(layer)].shape == (layer_dims[layer], 1))
return parameters
def forward_prop(X, parameters):
L = len(parameters) // 2
forward_cache = {}
forward_cache["z1"] = parameters["W1"] @ X + parameters["b1"]
forward_cache["a1"] = relu(forward_cache["z1"])
for layer in range(2, L):
forward_cache["z" + str(layer)] = parameters["W" + str(layer)] @ forward_cache["a" + str(layer - 1)] + \
parameters["b" + str(layer)]
forward_cache["a" + str(layer)] = relu(forward_cache["z" + str(layer)])
# Output neuron will have sigmoid activation function applied to classify the output according to a probability.
# This needs to be handled separately.
forward_cache["z" + str(L)] = parameters["W" + str(L)] @ forward_cache["a" + str(L - 1)] + parameters["b" + str(L)]
forward_cache["a" + str(L)] = sigmoid(forward_cache["z" + str(L)])
AL = forward_cache["a" + str(L)]
assert (AL.shape == (1, X.shape[1]))
return AL, forward_cache # Return y_hat (the model's prediction) and forward_cache (used in back-prop)
def compute_loss(A, Y, parameters, lambd):
L = len(parameters) // 2
m = Y.shape[1]
log_function = -(np.multiply(Y, np.log(A)) + np.multiply((1 - Y), np.log(1 - A)))
L2_regularisation_cost = 0
for weight in range(1, L):
L2_regularisation_cost += np.sum(np.square(parameters["W" + str(weight)]))
loss = 1. / m * (np.nansum(log_function) + (L2_regularisation_cost * lambd / 2))
loss = np.squeeze(loss)
assert (loss.shape == ())
return loss
def linear_backward(dz, W, b, a, lambd):
m = a.shape[1]
dW = (dz @ a.T) * W * lambd / m
db = np.sum(dz, axis=1, keepdims=True)
dA_prev = W.T @ dz
print("W:", W.shape,"dZ:", dz.shape)
print(dA_prev.shape, a.shape)
# assert (dA_prev.shape == a.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dW, db, dA_prev
def activation_backward(dA, a, z, W, b, activation_function, lambd):
if activation_function == "sigmoid":
dz = sigmoid_back(dA, z)
elif activation_function == "relu":
dz = relu_back(dA, z)
dW, db, dA_prev = linear_backward(dz, W, b, a, lambd)
return dW, db, dA_prev
def backward_prop(AL, Y, cache, parameters, lambd):
L = len(parameters) // 2
print(AL)
gradients = {}
dAL = -(np.divide(Y, AL)) + np.divide(1 - Y, 1 - AL)
dz = sigmoid_back(dAL, cache["z" + str(L)])
gradients["dW" + str(L)], gradients["db" + str(L)], dA_prev = \
linear_backward(dz, parameters["W" + str(L)], parameters["b" + str(L)], cache["a" + str(L)], lambd)
for l in reversed(range(1, L)):
gradients["dW" + str(l)], gradients["db" + str(l)], dA_prev = \
activation_backward(dA_prev, cache["a" + str(l)], cache["z" + str(l)],
parameters["W" + str(l)], parameters["b" + str(l)],
"relu", lambd)
return gradients
def update_parameters(parameters, gradients, learning_rate):
L = len(parameters) // 2
for l in range(1, L + 1):
parameters["W" + str(l)] = parameters["W" + str(l)] - (gradients["dW" + str(l)] * learning_rate)
parameters["b" + str(l)] = parameters["b" + str(l)] - (gradients["db" + str(l)] * learning_rate)
return parameters
def predict(X, Y, parameters):
"""
This function is used to predict the results of a L-layer neural network.
Arguments:
X -- data set of examples you would like to label
parameters -- parameters of the trained model
Returns:
p -- predictions for the given dataset X
"""
m = X.shape[1]
# n = len(parameters) // 2 number of layers in the neural network
binary_outcome = np.zeros((1, m))
# Forward propagation
probabilities = deep_neural_network(X, Y, parameters, )
# convert probabilities to 0/1 predictions
for i in range(0, probabilities.shape[1]):
if probabilities[0, i] > 0.5:
binary_outcome[0, i] = 1
else:
binary_outcome[0, i] = 0
# print results
# print ("predictions: " + str(p))
# print ("true labels: " + str(y))
print("Accuracy: " + str(np.sum((binary_outcome == Y) / m)))
return binary_outcome
def deep_neural_network(X, Y, layer_dims, number_of_iterations, learning_rate, print_cost, lambd):
losses = []
for epoch in range(number_of_iterations):
parameters = initialize_parameters(layer_dims)
AL, forward_cache = forward_prop(X, parameters)
loss = compute_loss(AL, Y, parameters, lambd)
losses.append(loss)
gradients = backward_prop(AL, Y, forward_cache, parameters, lambd)
parameters = update_parameters(parameters, gradients, learning_rate)
if (epoch % 100 == 0) and print_cost:
print(loss)
return parameters
return
cancer = load_breast_cancer()
data = cancer.data
labels = cancer.target
xtrain, xtest, ytrain, ytest = train_test_split(data, labels)
xtrain = xtrain.T
ytrain = ytrain.reshape((1, 426))
parameters, costs = deep_neural_network(xtrain, ytrain, [30, 64, 32, 8, 1], learning_rate=0.045,
number_of_iterations=2000, print_cost=True, lambd=0.01)
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per tens)')
plt.title("Learning rate =" + str(0.075))
plt.show()
predictions = predict(X=xtrain, parameters=parameters)
Answer 0 (score: 0):
Solved:
The following code implements a customisable deep neural network; my DNN was trained on the extended MNIST (EMNIST) dataset.
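As a quick orientation before the full listing, these are the shapes the script at the bottom assumes (a sketch derived from the code below, using its column-per-example convention):
# EMNIST 'balanced' has 47 classes, so the network ends in a 47-unit softmax layer
# x_train: (784, size)  one flattened, normalised 28x28 image per column
# y_train: (47, size)   one-hot label columns (0/1 softened to 0.01/0.99 in getData)
layer_dims = [784, 30, 20, 47]  # input layer, two relu hidden layers, softmax output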
import numpy as np
import matplotlib.pyplot as plt
from emnist import extract_training_samples
from emnist import extract_test_samples
import keras
import time
import os
# Converts data from the emnist database into the required format
# for the neural network and outputs the X and Y datasets
def getData(size, images, labels):
training_samples = images.shape[0]
# Converts each matrix of pixels (28x28) into feature vectors
xtrain = images.reshape(training_samples, 784)
# Arranges each training example into a column
xtrain = xtrain.T
#normalises the database to prevent the exploding gradients problem
xtrain = xtrain / 255
ytrain = labels.reshape(1, training_samples)
# Converts the numerical labels into one-hot arrays with 47 elements.
# For example a label of 10 will be converted into an array of 0s with
# a 1 in the 10th position. This can be abstracted as the 10th neuron
# firing in the neural network.
y_train = keras.utils.to_categorical(ytrain, 47)
y_train = y_train[0].T
# Replaces 1s and 0s with approximate probabilities to prevent NaN errors in
# the cost entropy function
y_train[y_train == 0] = 0.01
y_train[y_train == 1] = 0.99
# Allows choice of the number of training examples input into the neural network.
xsmall = np.hsplit(xtrain, [size, (training_samples - size)])[0]
ysmall = np.hsplit(y_train, [size, (training_samples - size)])[0]
# sanity check
print(xsmall.shape, ysmall.shape)
return xsmall, ysmall
# Activation function that replaces all negative values with 0
def relu(Z):
A = np.maximum(0, Z)
assert (A.shape == Z.shape)
return A
# Derivative of the relu function: 0 if Z is negative and Z if Z is positive
def relu_backward(dA, cache, layer):
# Accesses Z from the cache set up in the feed_forward function
Z = cache["Z" + str(layer)]
dZ = np.array(dA, copy=True)
# At 0, I've set the derivative to be 0 as well, although it isn't differentiable at that point
dZ[Z <= 0] = 0
assert (dZ.shape == Z.shape)
return dZ
# Activation function on the final layer that converts the linear transformation
# from the layer into probabilities that sum to 1. This is the neural net's estimate of how
# likely it is that the input example is each label in the dataset: the highest probability is
# what we choose as the neural net's output
def softmax(Z):
exp_Z = np.exp(Z - np.max(Z))
A = exp_Z / np.sum(exp_Z, axis=0)
assert (A.shape == Z.shape)
return A
# Combined with the cross-entropy cost function, the derivative of the softmax
# simplifies to the difference between the neural net's guess and the actual label.
def softmax_backward(cache, layer):
# fetching values from the cache
Z = cache["Z" + str(layer)]
AL = cache["A" + str(layer)]
Y = cache["Y"]
dZ = AL - Y
assert (dZ.shape == Z.shape)
return dZ
# This sets up our parameters depending on the required dimensions of the neural network.
def initialise_parameters(layer_dims):
    parameters = {}
    np.random.seed(2)
    # He initialisation allows for symmetry breaking and efficient gradient descent
    for l in range(1, len(layer_dims)):
        W_layer = "W" + str(l)
        b_layer = "b" + str(l)
        parameters[W_layer] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(2 / layer_dims[l - 1])
        parameters[b_layer] = np.zeros((layer_dims[l], 1))
        assert (parameters[W_layer].shape == (layer_dims[l], layer_dims[l - 1]))
        assert (parameters[b_layer].shape == (layer_dims[l], 1))
    return parameters
# Outputs Z as the linear transformation of the previous layer's output using the parameters: Weight and bias.
def linear_forward(W, A_previous, b):
Z = W @ A_previous + b
assert (Z.shape == (W.shape[0], A_previous.shape[1]))
return Z
# chooses the activation function that Z passes through depending on the activation_type requested.
def activation_forward(Z, activation_type):
if activation_type == "relu":
A = relu(Z)
elif activation_type == "softmax":
A = softmax(Z)
assert (A.shape == Z.shape)
return A
def feed_forward(X, parameters, Y):
cache = {}
A = X
# Fetches number of layers in the neural network.
L = len(parameters) // 2
# considers the feature vector input as the 0th layer, initialising the cache
cache["A0"] = X
cache["Y"] = Y
# Uses the relu for all the hidden layers
for l in range(1, L + 1):
# Uses softmax for the output layer
activation_type = "relu"
if l == L:
activation_type = "softmax"
# Combines all the helper functions above to propagate through the neural network
A_previous = A
W = parameters["W" + str(l)]
b = parameters["b" + str(l)]
Z = linear_forward(W, A_previous, b)
A = activation_forward(Z, activation_type)
# We store these parameters for use in the back propagation, since we use the element inside
# when differentiating according to the chain rule. Saving these here improves efficiency and speed.
cache["W" + str(l)] = W
cache["b" + str(l)] = b
cache["Z" + str(l)] = Z
cache["A" + str(l)] = A
# Returns the final output layer and the cache
return cache["A" + str(L)], cache
def compute_cost(AL, Y):
training_samples = Y.shape[1]
# Zeroes can introduce NaN errors in the cost function, so we approximate them closely
# to prevent log(0) from occurring and messing up our cost.
AL_nonzero = np.maximum(AL, 1.0e-15)
cost = (-1. / training_samples) * (np.sum(np.multiply(Y, np.log(AL_nonzero))) + np.sum(np.multiply(1 - Y, np.log(1 - AL_nonzero))))
# Removes unnecessary dimensions from the cost value, so [[0.15]] is converted into 0.15
cost = np.squeeze(cost)
assert (cost.shape == ())
return cost
# Performs backward propagation from one layer to the previous depending on
# the activation function specified.
def activation_backward(dAL, cache, layer, activation_type):
if activation_type == "relu":
dZ = relu_backward(dAL, cache, layer)
elif activation_type == "softmax":
dZ = softmax_backward(cache, layer)
# Accesses values from cache
A_prev = cache["A" + str(layer - 1)]
W = cache["W" + str(layer)]
b = cache["b" + str(layer)]
training_samples = A_prev.shape[1]
dW = (1. / training_samples) * np.dot(dZ, A_prev.T)
db = (1. / training_samples) * np.sum(dZ, axis=1, keepdims=True)
dA_prev = np.dot(W.T, dZ)
assert (dA_prev.shape == A_prev.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dA_prev, dW, db
# Performs backward propagation on the whole neural network to find the derivatives of the
# cost function with respect to the parameters Weight and bias of each layer. These derivatives will
# allow us to perform gradient descent on the cost function, tuning the parameters to improve accuracy.
def back_propagation(cache, Y, L):
gradients = {}
# Accesses output layer matrix from cache
AL = cache["A" + str(L)]
# Derivative of the cross entropy cost function
gradients["dA" + str(L)] = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
# Uses relu_backward for all the hidden layers
for l in reversed(range(1, L + 1)):
activation_type = "relu"
# Uses softmax_backward for the output layer
if l == L:
activation_type = "softmax"
# finds gradients of each layer
dA_previous, dW_current, db_current = activation_backward(gradients["dA" + str(l)], cache, l, activation_type)
# Stores these gradients in the gradients dictionary
gradients["dA" + str(l - 1)] = dA_previous
gradients["dW" + str(l)] = dW_current
gradients["db" + str(l)] = db_current
return gradients
# Updates all of the parameters by subtracting their gradients from them
def update_parameters(gradients, parameters, learning_rate):
L = len(parameters) // 2
    for l in range(1, L + 1):
W_layer = "W" + str(l)
b_layer = "b" + str(l)
dW_layer = "dW" + str(l)
db_layer = "db" + str(l)
# The learning rate needs to be tuned to approach the minimum of the cost accurately and reliably
# while optimising time taken to train the neural net
parameters[W_layer] = parameters[W_layer] - (gradients[dW_layer] * learning_rate)
parameters[b_layer] = parameters[b_layer] - (gradients[db_layer] * learning_rate)
return parameters
# Calculates accuracy of the neural net's prediction against the true labels
def calculate_accuracy(parameters, x, y_onehot, size):
# Uses parameters to get the neural net's guess
AL, cache = feed_forward(x, parameters, y_onehot)
# Finds the highest probability character for each training example
AL_max = np.argmax(AL, axis=0)
AL_max = AL_max.reshape(1, AL_max.shape[0])
# Converts the labels from one-hot matrices back into a numerical vector
y_int = np.argmax(y_onehot, axis=0)
y_int = y_int.reshape(1, y_int.shape[0])
# Compares the two
accuracy = np.mean(AL_max == y_int)
return accuracy
# Brings everything together to train the model
def NN_model(X, Y, layer_dims, learning_rate, num_iterations, size, print_values):
# Accesses initial parameters
parameters = initialise_parameters(layer_dims)
# Used to calculate the training time over the iteration
times = []
# Used to plot the accuracies at the end
accuracies = []
times.append(time.time())
for i in range(num_iterations):
# Accesses the output and cache using forward propagation through the parameters
AL, cache = feed_forward(X, parameters, Y)
        # Calculates the gradients using the previously generated cache
gradients = back_propagation(cache, Y, len(layer_dims) - 1)
# Tunes the parameters to achieve greater accuracy
parameters = update_parameters(gradients, parameters, learning_rate)
# Used to track progress during training
if i % 10 == 0 and print_values:
time_difference = time.time() - times[-1]
times.append(time.time())
print("Time:", time_difference)
cost = compute_cost(AL, Y)
accuracy = calculate_accuracy(parameters, x_test, y_test, size)
print("Cost on test dataset after iteration %i: %f, accuracy: %f" % (i, cost, accuracy))
# costs.append(cost)
accuracies.append(accuracy)
if i % 1000 == 0:
learning_rate = learning_rate * 0.98
# Plots graph of accuracy to mark improvement
plt.xlabel('iterations')
plt.title("Learning rate =" + str(learning_rate))
line, = plt.plot(np.squeeze(accuracies), label='accuracies')
plt.xlabel('iterations x10')
plt.legend()
plt.show()
return parameters
size = 30000
images, labels = extract_training_samples('balanced')
x_train, y_train = getData(size, images, labels)
images, labels = extract_test_samples('balanced')
global x_test
global y_test
x_test, y_test = getData(size, images, labels)
layer_dims = [784, 30, 20, 47]
parameters = initialise_parameters(layer_dims)
print(calculate_accuracy(parameters, x_train, y_train, size))
parameters = NN_model(x_train, y_train, layer_dims, 0.03, 40000, size, print_values=True)
print(calculate_accuracy(parameters, x_train, y_train, size))