I'm having trouble with the dimensions in my deep neural network, which raises this error:
line 104, in linear_backward
dW = (dz @ a.T) * W * lambd / m
ValueError: operands could not be broadcast together with shapes (8,8) (8,32)
The problem seems clearly to be in back-propagation, but I can't find my mistake. I apologise in advance for the messy code, and I'd be very grateful for any help, as I'm a high-school student with no one around to ask.
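For reference, the standard L2-regularised weight gradient (sketched here under the usual shape conventions and with generic names, not taken from my code) adds the regularisation term rather than multiplying by W:
# dz: (n_l, m), a_prev: (n_(l-1), m), W: (n_l, n_(l-1)), so dz @ a_prev.T matches W's shape
dW = (dz @ a_prev.T) / m + (lambd / m) * W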
The program is below:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Neural Network Build
def sigmoid(Z):
# activation function used to classify the output
# range of the function is between 0 and 1.
a = (1 + np.exp(-Z)) ** -1
return a
def relu(Z):
# activation function used in hidden layers
A = np.maximum(0, Z) # outputs Z if Z is positive, otherwise returns 0
assert (A.shape == Z.shape) #stops program if this isn't true
return A
def relu_back(dA, Z):
# function finding the derivative of the relu function
dZ = np.array(dA, copy=True)
dZ[Z <= 0] = 0
return dZ
def sigmoid_back(dA, Z):
# function finding the derivative of the sigmoid function
s = sigmoid(Z)
dZ = dA * s * (1 - s)
return dZ
def initialize_parameters(layer_dims):
parameters = {}
L = len(layer_dims)
for layer in range(1, L):
parameters["W" + str(layer)] = np.random.randn(layer_dims[layer], layer_dims[layer - 1]) * np.sqrt(
2 / layer_dims[layer - 1])
parameters["b" + str(layer)] = np.zeros((layer_dims[layer], 1))
print(parameters["W" + str(layer)].shape)
assert (parameters['W' + str(layer)].shape == (layer_dims[layer], layer_dims[layer - 1]))
assert (parameters['b' + str(layer)].shape == (layer_dims[layer], 1))
return parameters
def forward_prop(X, parameters):
L = len(parameters) // 2
forward_cache = {}
forward_cache["z1"] = parameters["W1"] @ X + parameters["b1"]
forward_cache["a1"] = relu(forward_cache["z1"])
for layer in range(2, L):
forward_cache["z" + str(layer)] = parameters["W" + str(layer)] @ forward_cache["a" + str(layer - 1)] + \
parameters["b" + str(layer)]
forward_cache["a" + str(layer)] = relu(forward_cache["z" + str(layer)])
# Output neuron will have sigmoid activation function applied to classify the output according to a probability.
# This needs to be handled separately.
forward_cache["z" + str(L)] = parameters["W" + str(L)] @ forward_cache["a" + str(L - 1)] + parameters["b" + str(L)]
forward_cache["a" + str(L)] = sigmoid(forward_cache["z" + str(L)])
AL = forward_cache["a" + str(L)]
assert (AL.shape == (1, X.shape[1]))
return AL, forward_cache # Return y_hat (the model's prediction) and forward_cache (used in back-prop)
def compute_loss(A, Y, parameters, lambd):
L = len(parameters) // 2
m = Y.shape[1]
log_function = -(np.multiply(Y, np.log(A)) + np.multiply((1 - Y), np.log(1 - A)))
L2_regularisation_cost = 0
for weight in range(1, L):
L2_regularisation_cost += np.sum(np.square(parameters["W" + str(weight)]))
loss = 1. / m * (np.nansum(log_function) + (L2_regularisation_cost * lambd / 2))
loss = np.squeeze(loss)
assert (loss.shape == ())
return loss
def linear_backward(dz, W, b, a, lambd):
m = a.shape[1]
dW = (dz @ a.T) * W * lambd / m
db = np.sum(dz, axis=1, keepdims=True)
dA_prev = W.T @ dz
print("W:", W.shape,"dZ:", dz.shape)
print(dA_prev.shape, a.shape)
# assert (dA_prev.shape == a.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dW, db, dA_prev
def activation_backward(dA, a, z, W, b, activation_function, lambd):
if activation_function == "sigmoid":
dz = sigmoid_back(dA, z)
elif activation_function == "relu":
dz = relu_back(dA, z)
dW, db, dA_prev = linear_backward(dz, W, b, a, lambd)
return dW, db, dA_prev
def backward_prop(AL, Y, cache, parameters, lambd):
L = len(parameters) // 2
print(AL)
gradients = {}
dAL = -(np.divide(Y, AL)) + np.divide(1 - Y, 1 - AL)
dz = sigmoid_back(dAL, cache["z" + str(L)])
gradients["dW" + str(L)], gradients["db" + str(L)], dA_prev = \
linear_backward(dz, parameters["W" + str(L)], parameters["b" + str(L)], cache["a" + str(L)], lambd)
for l in reversed(range(1, L)):
gradients["dW" + str(l)], gradients["db" + str(l)], dA_prev = \
activation_backward(dA_prev, cache["a" + str(l)], cache["z" + str(l)],
parameters["W" + str(l)], parameters["b" + str(l)],
"relu", lambd)
return gradients
def update_parameters(parameters, gradients, learning_rate):
L = len(parameters) // 2
for l in range(1, L + 1):
parameters["W" + str(l)] = parameters["W" + str(l)] - (gradients["dW" + str(l)] * learning_rate)
parameters["b" + str(l)] = parameters["b" + str(l)] - (gradients["db" + str(l)] * learning_rate)
return parameters
def predict(X, Y, parameters):
"""
This function is used to predict the results of a L-layer neural network.
Arguments:
X -- data set of examples you would like to label
parameters -- parameters of the trained model
Returns:
p -- predictions for the given dataset X
"""
m = X.shape[1]
# n = len(parameters) // 2 number of layers in the neural network
binary_outcome = np.zeros((1, m))
# Forward propagation
probabilities = deep_neural_network(X, Y, parameters, )
# convert probabilities to 0/1 predictions
for i in range(0, probabilities.shape[1]):
if probabilities[0, i] > 0.5:
binary_outcome[0, i] = 1
else:
binary_outcome[0, i] = 0
# print results
# print ("predictions: " + str(p))
# print ("true labels: " + str(y))
print("Accuracy: " + str(np.sum((binary_outcome == Y) / m)))
return binary_outcome
def deep_neural_network(X, Y, layer_dims, number_of_iterations, learning_rate, print_cost, lambd):
losses = []
for epoch in range(number_of_iterations):
parameters = initialize_parameters(layer_dims)
AL, forward_cache = forward_prop(X, parameters)
loss = compute_loss(AL, Y, parameters, lambd)
losses.append(loss)
gradients = backward_prop(AL, Y, forward_cache, parameters, lambd)
parameters = update_parameters(parameters, gradients, learning_rate)
if (epoch % 100 == 0) and print_cost:
print(loss)
return parameters
return
cancer = load_breast_cancer()
data = cancer.data
labels = cancer.target
xtrain, xtest, ytrain, ytest = train_test_split(data, labels)
xtrain = xtrain.T
ytrain = ytrain.reshape((1, 426))
parameters, costs = deep_neural_network(xtrain, ytrain, [30, 64, 32, 8, 1], learning_rate=0.045,
number_of_iterations=2000, print_cost=True, lambd=0.01)
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per tens)')
plt.title("Learning rate =" + str(0.075))
plt.show()
predictions = predict(X=xtrain, parameters=parameters)
Answer 0 (score: 0):
Solved:
The following code implements a customisable deep neural network; my DNN was trained on the extended MNIST (EMNIST) dataset.
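As a quick orientation before the full listing, these are the shapes the script at the bottom assumes (a sketch derived from the code below, using its column-per-example convention):
# EMNIST 'balanced' has 47 classes, so the network ends in a 47-unit softmax layer
# x_train: (784, size)  one flattened, normalised 28x28 image per column
# y_train: (47, size)   one-hot label columns (0/1 softened to 0.01/0.99 in getData)
layer_dims = [784, 30, 20, 47]  # input layer, two relu hidden layers, softmax output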
import numpy as np
import matplotlib.pyplot as plt
from emnist import extract_training_samples
from emnist import extract_test_samples
import keras
import time
import os
# Converts data from the emnist database into the required format
# for the neural network and outputs the X and Y datasets
def getData(size, images, labels):
training_samples = images.shape[0]
# Converts each matrix of pixels (28x28) into feature vectors
xtrain = images.reshape(training_samples, 784)
# Arranges each training example into a column
xtrain = xtrain.T
#normalises the database to prevent the exploding gradients problem
xtrain = xtrain / 255
ytrain = labels.reshape(1, training_samples)
# Converts the numerical labels into one-hot arrays with 47 elements.
# For example a label of 10 will be converted into an array of 0s with
# a 1 in the 10th position. This can be abstracted as the 10th neuron
# firing in the neural network.
y_train = keras.utils.to_categorical(ytrain, 47)
y_train = y_train[0].T
# Replaces 1s and 0s with approximate probabilities to prevent NaN errors in
# the cost entropy function
y_train[y_train == 0] = 0.01
y_train[y_train == 1] = 0.99
# Allows choice of the number of training examples input into the neural network.
xsmall = np.hsplit(xtrain, [size, (training_samples - size)])[0]
ysmall = np.hsplit(y_train, [size, (training_samples - size)])[0]
# sanity check
print(xsmall.shape, ysmall.shape)
return xsmall, ysmall
# Activation function that replaces all negative values with 0
def relu(Z):
A = np.maximum(0, Z)
assert (A.shape == Z.shape)
return A
# Derivative of the relu function: 0 if Z is negative and Z if Z is positive
def relu_backward(dA, cache, layer):
# Accesses Z from the cache set up in the feed_forward function
Z = cache["Z" + str(layer)]
dZ = np.array(dA, copy=True)
# At 0, I've set the derivative to be 0 as well, although it isn't differentiable at that point
dZ[Z <= 0] = 0
assert (dZ.shape == Z.shape)
return dZ
# Activation function on the final layer that converts the linear transformation
# from the layer into probabilities that sum to 1. This is the neural net's estimate of how
# likely it is that the input example is each label in the dataset: the highest probability is
# what we choose as the neural net's output
def softmax(Z):
exp_Z = np.exp(Z - np.max(Z))
A = exp_Z / np.sum(exp_Z, axis=0)
assert (A.shape == Z.shape)
return A
# Combined with the cross-entropy cost function, the derivative of the softmax
# simplifies to the difference between the neural net's guess and the actual label.
def softmax_backward(cache, layer):
# fetching values from the cache
Z = cache["Z" + str(layer)]
AL = cache["A" + str(layer)]
Y = cache["Y"]
dZ = AL - Y
assert (dZ.shape == Z.shape)
return dZ
# This sets up our parameters depending on the required dimensions of the neural network.
def initialise_parameters(layer_dims):
    parameters = {}
    np.random.seed(2)
    # He initialisation allows for symmetry breaking and efficient gradient descent
    for l in range(1, len(layer_dims)):
        W_layer = "W" + str(l)
        b_layer = "b" + str(l)
        parameters[W_layer] = np.random.randn(layer_dims[l], layer_dims[l - 1]) * np.sqrt(2 / layer_dims[l - 1])
        parameters[b_layer] = np.zeros((layer_dims[l], 1))
        assert (parameters[W_layer].shape == (layer_dims[l], layer_dims[l - 1]))
        assert (parameters[b_layer].shape == (layer_dims[l], 1))
    return parameters
# Outputs Z as the linear transformation of the previous layer's output using the parameters: Weight and bias.
def linear_forward(W, A_previous, b):
Z = W @ A_previous + b
assert (Z.shape == (W.shape[0], A_previous.shape[1]))
return Z
# chooses the activation function that Z passes through depending on the activation_type requested.
def activation_forward(Z, activation_type):
if activation_type == "relu":
A = relu(Z)
elif activation_type == "softmax":
A = softmax(Z)
assert (A.shape == Z.shape)
return A
def feed_forward(X, parameters, Y):
cache = {}
A = X
# Fetches number of layers in the neural network.
L = len(parameters) // 2
# considers the feature vector input as the 0th layer, initialising the cache
cache["A0"] = X
cache["Y"] = Y
# Uses the relu for all the hidden layers
for l in range(1, L + 1):
# Uses softmax for the output layer
activation_type = "relu"
if l == L:
activation_type = "softmax"
# Combines all the helper functions above to propagate through the neural network
A_previous = A
W = parameters["W" + str(l)]
b = parameters["b" + str(l)]
Z = linear_forward(W, A_previous, b)
A = activation_forward(Z, activation_type)
# We store these parameters for use in the back propagation, since we use the element inside
# when differentiating according to the chain rule. Saving these here improves efficiency and speed.
cache["W" + str(l)] = W
cache["b" + str(l)] = b
cache["Z" + str(l)] = Z
cache["A" + str(l)] = A
# Returns the final output layer and the cache
return cache["A" + str(L)], cache
def compute_cost(AL, Y):
training_samples = Y.shape[1]
# Zeroes can introduce NaN errors in the cost function, so we approximate them closely
# to prevent log(0) from occurring and messing up our cost.
AL_nonzero = np.maximum(AL, 1.0e-15)
cost = (-1. / training_samples) * (np.sum(np.multiply(Y, np.log(AL_nonzero))) + np.sum(np.multiply(1 - Y, np.log(1 - AL_nonzero))))
# Removes unnecessary dimensions from the cost value, so [[0.15]] is converted into 0.15
cost = np.squeeze(cost)
assert (cost.shape == ())
return cost
# Performs backward propagation from one layer to the previous depending on
# the activation function specified.
def activation_backward(dAL, cache, layer, activation_type):
if activation_type == "relu":
dZ = relu_backward(dAL, cache, layer)
elif activation_type == "softmax":
dZ = softmax_backward(cache, layer)
# Accesses values from cache
A_prev = cache["A" + str(layer - 1)]
W = cache["W" + str(layer)]
b = cache["b" + str(layer)]
training_samples = A_prev.shape[1]
dW = (1. / training_samples) * np.dot(dZ, A_prev.T)
db = (1. / training_samples) * np.sum(dZ, axis=1, keepdims=True)
dA_prev = np.dot(W.T, dZ)
assert (dA_prev.shape == A_prev.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dA_prev, dW, db
# Performs backward propagation on the whole neural network to find the derivatives of the
# cost function with respect to the parameters Weight and bias of each layer. These derivatives will
# allow us to perform gradient descent on the cost function, tuning the parameters to improve accuracy.
def back_propagation(cache, Y, L):
gradients = {}
# Accesses output layer matrix from cache
AL = cache["A" + str(L)]
# Derivative of the cross entropy cost function
gradients["dA" + str(L)] = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
# Uses relu_backward for all the hidden layers
for l in reversed(range(1, L + 1)):
activation_type = "relu"
# Uses softmax_backward for the output layer
if l == L:
activation_type = "softmax"
# finds gradients of each layer
dA_previous, dW_current, db_current = activation_backward(gradients["dA" + str(l)], cache, l, activation_type)
# Stores these gradients in the gradients dictionary
gradients["dA" + str(l - 1)] = dA_previous
gradients["dW" + str(l)] = dW_current
gradients["db" + str(l)] = db_current
return gradients
# Updates all of the parameters by subtracting their gradients from them
def update_parameters(gradients, parameters, learning_rate):
L = len(parameters) // 2
    for l in range(1, L + 1):
W_layer = "W" + str(l)
b_layer = "b" + str(l)
dW_layer = "dW" + str(l)
db_layer = "db" + str(l)
# The learning rate needs to be tuned to approach the minimum of the cost accurately and reliably
# while optimising time taken to train the neural net
parameters[W_layer] = parameters[W_layer] - (gradients[dW_layer] * learning_rate)
parameters[b_layer] = parameters[b_layer] - (gradients[db_layer] * learning_rate)
return parameters
# Calculates accuracy of the neural net's prediction against the true labels
def calculate_accuracy(parameters, x, y_onehot, size):
# Uses parameters to get the neural net's guess
AL, cache = feed_forward(x, parameters, y_onehot)
# Finds the highest probability character for each training example
AL_max = np.argmax(AL, axis=0)
AL_max = AL_max.reshape(1, AL_max.shape[0])
# Converts the labels from one-hot matrices back into a numerical vector
y_int = np.argmax(y_onehot, axis=0)
y_int = y_int.reshape(1, y_int.shape[0])
# Compares the two
accuracy = np.mean(AL_max == y_int)
return accuracy
# Brings everything together to train the model
def NN_model(X, Y, layer_dims, learning_rate, num_iterations, size, print_values):
# Accesses initial parameters
parameters = initialise_parameters(layer_dims)
# Used to calculate the training time over the iteration
times = []
# Used to plot the accuracies at the end
accuracies = []
times.append(time.time())
for i in range(num_iterations):
# Accesses the output and cache using forward propagation through the parameters
AL, cache = feed_forward(X, parameters, Y)
        # Calculates the gradients using the previously generated cache
gradients = back_propagation(cache, Y, len(layer_dims) - 1)
# Tunes the parameters to achieve greater accuracy
parameters = update_parameters(gradients, parameters, learning_rate)
# Used to track progress during training
if i % 10 == 0 and print_values:
time_difference = time.time() - times[-1]
times.append(time.time())
print("Time:", time_difference)
cost = compute_cost(AL, Y)
accuracy = calculate_accuracy(parameters, x_test, y_test, size)
print("Cost on test dataset after iteration %i: %f, accuracy: %f" % (i, cost, accuracy))
# costs.append(cost)
accuracies.append(accuracy)
if i % 1000 == 0:
learning_rate = learning_rate * 0.98
# Plots graph of accuracy to mark improvement
plt.xlabel('iterations')
plt.title("Learning rate =" + str(learning_rate))
line, = plt.plot(np.squeeze(accuracies), label='accuracies')
plt.xlabel('iterations x10')
plt.legend()
plt.show()
return parameters
size = 30000
images, labels = extract_training_samples('balanced')
x_train, y_train = getData(size, images, labels)
images, labels = extract_test_samples('balanced')
global x_test
global y_test
x_test, y_test = getData(size, images, labels)
layer_dims = [784, 30, 20, 47]
parameters = initialise_parameters(layer_dims)
print(calculate_accuracy(parameters, x_train, y_train, size))
parameters = NN_model(x_train, y_train, layer_dims, 0.03, 40000, size, print_values=True)
print(calculate_accuracy(parameters, x_train, y_train, size))