我是ML的初学者,并尝试在Python中实现具有S形神经元和二次成本函数的神经网络。它对于简单的1输入情况很好用,但对于更复杂的情况(如涉及更多输入的XOR)却可以解决。输出只是目标值的平均值。例如,当我尝试在“与”门上训练网络时,该门接收输入[[0,0],[0,1],[1,0],[1,1]],并目标输出[0,0 [,0,1],输出收敛到0.25左右的平均值。在训练具有多个输入的其他测试时,我遇到了同样的问题。我高度怀疑问题的所在是权重和偏差梯度的累积。有人可以帮我找到我的代码有什么问题吗?
import numpy as np
class Network:
def __init__(self, layers):
self.layers = layers
# activations for each layer
self.activations = self.initialize_layers()
# bias for each layer
self.bias = self.initialize_layers()
# weights stored as a matrix for each layer
self.weights = self.initialize_weights(is_random = False)
# sum of weighted inputs and bias for each layer
self.zs = self.initialize_layers()
def initialize_layers(self):
return list(map(
lambda current_layer:
np.zeros(current_layer),
self.layers[1:]
))
def initialize_weights(self, is_random):
weights = []
for index, current_layer in enumerate(self.layers):
if index == 0:
continue
weights.append(
np.random.rand(
current_layer,
self.layers[index - 1]
) if is_random
else np.zeros((current_layer, self.layers[index - 1]))
)
return weights
def forward(self, inputs, layer_number):
if layer_number >= len(self.activations):
return
else:
if layer_number == 0:
previous_activations = inputs
else:
previous_activations = self.activations[layer_number - 1]
current_weights = self.weights[layer_number]
current_bias = self.bias[layer_number]
z = np.dot(current_weights, np.transpose(previous_activations)) + current_bias
self.zs[layer_number] = z
self.activations[layer_number] = sigmoid(z)
self.forward(inputs, layer_number + 1)
def backward(self, targets, sigmas, weight_gradients, bias_gradients, layer_number):
if layer_number == len(self.activations) - 1:
weight_gradients = self.initialize_weights(is_random = False)
bias_gradients = self.initialize_layers()
sigmas[-1] = cost_derivative(self.activations[-1], targets) * sigmoidPrime(self.zs[-1])
elif layer_number < 0:
return
else:
sigmas[layer_number] = (
np.dot(np.transpose(self.weights[layer_number + 1]), sigmas[layer_number + 1])
* sigmoidPrime(self.zs[layer_number])
)
bias_gradients[layer_number] = sigmas[layer_number]
weight_gradient = []
for sigma in sigmas[layer_number]:
row = []
for activation in self.activations[layer_number - 1]:
row.append(sigma * activation)
weight_gradient.append(row)
weight_gradients[layer_number] = weight_gradient
self.backward(targets, sigmas, weight_gradients, bias_gradients, layer_number - 1)
return (weight_gradients, bias_gradients)
def gradient_descent(self, weight_gradients, bias_gradients, learning_rate, input_length):
def gradient_helper(gradient_tuple):
current_value = gradient_tuple[0]
gradient = gradient_tuple[1]
return current_value - (learning_rate / input_length) * gradient
def apply_gradients(values, gradients):
return list(map(
gradient_helper,
zip(values, gradients)
))
self.weights = apply_gradients(self.weights, weight_gradients)
self.bias = apply_gradients(self.bias, bias_gradients)
def iterate(self, training_data, learning_rate, epochs):
for i in range(epochs):
sigmas = self.initialize_layers()
weight_gradients = self.initialize_weights(is_random = False)
bias_gradients = self.initialize_layers()
for row in training_data:
input_data = row[0]
target_data = row[1]
self.forward(input_data, 0)
(delta_weight_gradients, delta_bias_gradients) = self.backward(target_data, sigmas, None, None, len(self.activations) - 1)
weight_gradients = [gradient + delta_gradient for (gradient, delta_gradient) in zip(weight_gradients, delta_weight_gradients)]
bias_gradients = [gradient + delta_gradient for (gradient, delta_gradient) in zip(bias_gradients, delta_bias_gradients)]
self.gradient_descent(weight_gradients, bias_gradients, learning_rate, len(input_data))
def predict(self, inputs):
predictions = []
for input_data in inputs:
self.forward(input_data, 0)
predictions.append(self.activations[-1])
return predictions
def cost_derivative(activation, y):
return activation - y
def sigmoid(matrix):
return 1 / (1 + np.exp(-matrix))
def sigmoidPrime(matrix):
sig = sigmoid(matrix)
return sig * (1 - sig)
net = Network([2, 2, 1])
# print(net.activations[-1])
training_data = [
([0, 0], [0]),
([0, 1], [1]),
([1, 0], [1]),
([1, 1], [0])
]
net.iterate(training_data, learning_rate = 0.5, epochs = 1000)
print(net.predict(map(lambda data: data[0], training_data)))