I am new to the topic of neural networks and I am learning Python.
I am trying to teach a neural network to recognize digits from the MNIST database. I have a problem with computing the gradient over the biases. I thought it is the same as the gradient over f (the output after applying the weight matrix, but before applying the sigmoid), yet it seems I am wrong: computing it numerically gives a different result.
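To make precise what I expected: for a single sigmoid layer with loss L = 0.5*||sigmoid(W x + b) - y||^2, the gradient over b should equal the gradient over f = W x + b, and a small standalone check (independent of my network code below; all names in it are only illustrative) seems to confirm this, which is why I do not understand the mismatch in my network:

import numpy as np

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

rng = np.random.RandomState(0)
W = rng.randn(3, 4)
b = rng.randn(3, 1)
x = rng.randn(4, 1)
y = rng.randn(3, 1)

f = np.dot(W, x) + b
g = sigmoid(f)
grad_f = (g - y)*g*(1 - g)        # analytical gradient over f (= expected gradient over b)

eps = 1e-6
grad_b_num = np.zeros_like(b)
for i in range(b.shape[0]):
    bp, bm = b.copy(), b.copy()
    bp[i] += eps
    bm[i] -= eps
    Lp = 0.5*np.sum((sigmoid(np.dot(W, x) + bp) - y)**2)
    Lm = 0.5*np.sum((sigmoid(np.dot(W, x) + bm) - y)**2)
    grad_b_num[i] = (Lp - Lm)/(2*eps)   # central-difference estimate of dL/db[i]

print(np.max(np.abs(grad_f - grad_b_num)))   # expected to be tiny: the two gradients agree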
After 10000 epochs my network has 10% accuracy, no matter which of the two bias gradients I use.
Can you tell me what is wrong in my code (in the backpropagation function)?
My code:
# coding: utf-8
import random
import numpy as np
#import pandas as pd
from tensorflow.examples.tutorials.mnist import input_data
# Let's read the mnist dataset
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
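# mnist.train / mnist.test are DataSet objects: next_batch(k) returns a pair (images, labels),
# with images flattened to 784 floats and labels one-hot vectors of length 10 (one_hot=True)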
# In this exercise your task is to fill in the gaps in this code by implementing the backpropagation algorithm
# Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters.
#
def Loss_function(x,y):
    # takes two column vectors; returns half of the sum of squared differences
    return 0.5*(np.dot(np.transpose(np.array(x)-np.array(y)),np.array(x)-np.array(y)))
def sigmoid(z):
    return 1.0/(1.0+np.exp((-1)*(np.array(z))))
def sigmoid_prime(z):
    # Derivative of the sigmoid
    return sigmoid(np.array(z))*(1-sigmoid(np.array(z)))
class Network(object):
    def __init__(self, sizes):
        # initialize biases and weights with random normal distr.
        # weights are indexed by target node first
        ##size
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]
        # self.weights are the transfer matrices to go from one layer to the next (then apply the sigmoid)
    def feedforward(self, a):
        # Run the network on a single case
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a
    def feedforwardN(self, B, W, a):
        # Run the network on a single case with the given biases B and weights W
        for b, w in zip(B, W):
            a = sigmoid(np.dot(w, a)+b)
        return a
    def update_mini_batch(self, mini_batch, eta):
        # Update networks weights and biases by applying a single step
        # of gradient descent using backpropagation to compute the gradient.
        # The gradient is computed for a mini_batch which is as in tensorflow API.
        # eta is the learning rate
        print("update mini batch")
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        print(len(mini_batch[0]))
        print(len(mini_batch[1]))
        number=1
        print(number)
        for x, y in zip(mini_batch[0],mini_batch[1]):
            delta_nabla_b, delta_nabla_w = self.backprop(x.reshape(784,1), y.reshape(10,1))
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
            number=number+1
            print(number)
        print("we are outside the loop now")
        self.weights = [w-(eta/len(mini_batch[0]))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch[0]))*nb
                       for b, nb in zip(self.biases, nabla_b)]
    def backprop(self, x, y):
        # For a single input (x,y) return a pair of lists.
        # First contains gradients over biases, second over weights.
        # First initialize the list of gradient arrays
        # TODO
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        gradient_bN= gradient_b
        gradient_wN= gradient_w
        # Then go forward remembering all values before and after activations
        # in two other array lists
        # TODO
        # f[i+1]: output after applying the weight matrix (and adding the bias) to g[i]
        # g[i]: output after applying the sigmoid to f[i]
        f=[np.zeros(s) for s in self.sizes]
        g=[np.zeros(s) for s in self.sizes]
        f[0]=x
        i=1
        g[0]=f[0]
        for b, w in zip(self.biases, self.weights):
            f[i]=(np.dot(w, g[i-1])+b)
            g[i]=sigmoid(f[i])
            i=i+1
        # f[0] and g[0] are not really needed
        # Now go backward from the final cost applying backpropagation
        # TODO
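        # Standard backprop recursion this block is intended to compute,
        # with L = 0.5*||g[n-1]-y||^2 and g[i] = sigmoid(f[i]):
        #   gradient_g[n-1] = g[n-1] - y
        #   gradient_f[i]   = gradient_g[i] * g[i] * (1 - g[i])      (elementwise)
        #   gradient_g[i]   = self.weights[i]^T . gradient_f[i+1]
        #   gradient over biases:  gradient_f[1:]
        #   gradient over weights: gradient_w[i] = outer(gradient_f[i+1], g[i])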
        gradient_g=g
        gradient_f=f
        n=len(g)
        gradient_g[n-1]=(np.array(g[n-1])-np.array(y))
        gradient_f[n-1]=np.multiply(np.multiply(gradient_g[n-1], g[n-1]), (1-g[n-1]))
        for i in range(n-2, 0, -1):
            gradient_g[i]=np.dot(np.transpose(np.array(self.weights[i])), gradient_f[i+1])
            gradient_f[i]=np.multiply(np.multiply(gradient_g[i], g[i]), (1-g[i]))
        gradient_b=gradient_f[1:]
        for i in range(0,len(gradient_w)):
            gradient_w[i]=np.outer(gradient_f[i+1], g[i])
        # numerical gradient
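        # one-sided (forward) difference, checked entry by entry:
        #   gradient_bN[i][j] ~ (L(b with b[i][j]+epsilon) - L(b)) / epsilon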
        epsilon=0.000001
        bN = [np.array(M) for M in self.biases]
        predY= self.feedforwardN(self.biases, self.weights, x)
        predL = Loss_function(predY,y)
        for i in range(0, len(self.biases)):
            for j in range(0, len(self.biases[i])):
                bN = [np.array(M) for M in self.biases]
                bN[i][j] = bN[i][j] + epsilon
                predYE= self.feedforwardN(bN, self.weights, x)
                gradient_bN[i][j]=(Loss_function(predYE,y)- Loss_function(predY,y))/epsilon
        differenceB=0.0
        rI=0
        rJ=0
        for i in range(0, len(gradient_b)):
            for j in range(0, len(gradient_b[i])):
                if (max(np.absolute(differenceB), np.absolute(gradient_bN[i][j]- gradient_b[i][j])) > differenceB):
                    rI=i
                    rJ=j
                    differenceB= max(np.absolute(differenceB), np.absolute(gradient_bN[i][j]- gradient_b[i][j]))
        print("maximum absolute difference between the numerical and the analytical gradient of B, over all entries")
        print(differenceB)
        print("at entry rI, rJ")
        print(rI,rJ)
        print(gradient_b[rI][rJ], gradient_bN[rI][rJ])
        return gradient_b,gradient_w
    def evaluate(self, test_data):
        # Count the number of correct answers for test_data
        test_results = [(np.argmax(self.feedforward(test_data[0][i].reshape(784,1))), np.argmax(test_data[1][i]))
                        for i in range(len(test_data[0]))]
        # print(test_results)
        return sum(int(x == y) for (x, y) in test_results)
    def cost_derivative(self, output_activations, y):
        return (output_activations-y)
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        for j in range(epochs):
            print("epoch nr")
            print(j)
            self.update_mini_batch(training_data.next_batch(mini_batch_size), eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data.next_batch(mini_batch_size)), mini_batch_size))
            else:
                print("Epoch {0} complete".format(j))
network = Network([784,30,10])
network.SGD(mnist.train,epochs=10000,mini_batch_size=100,eta=3.0,test_data=mnist.test)
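For completeness, one way to evaluate on the whole test set after training (a minimal sketch, assuming the standard images, labels and num_examples attributes of the DataSet objects returned by read_data_sets) would be:

# evaluate on the full test set instead of a single 100-example batch
full_test = (mnist.test.images, mnist.test.labels)
print("Full test set: {0} / {1}".format(network.evaluate(full_test), mnist.test.num_examples))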