This is code I got from a Medium post. I am new to neural networks, and I would like someone to explain to me (please don't just hand me an answer, explain it if you can) how to change the batch size used in this code into an SGD batch size. Thanks in advance!
Code:
import numpy as np

class NeuralNetwork(object):
    # The constructor takes
    # 1. layers representing the number of nodes
    # 2. activations representing the activation functions to choose in
    #    each layer.
    def __init__(self, layers=[2, 10, 1], activations=['sigmoid', 'sigmoid']):
        # check to make sure that the no. of layers is one more than the
        # no. of activation functions, because the input layer
        # has no activation function.
        assert(len(layers) == len(activations) + 1)
        # define the local variables layers and activations
        self.layers = layers
        self.activations = activations
        # initialize weights and biases as two lists to hold the
        # weights and biases for each layer
        self.weights = []
        self.biases = []
        # create random weights and biases
        # for each of the layers.
        for i in range(len(layers) - 1):
            self.weights.append(np.random.randn(layers[i+1], layers[i]))
            self.biases.append(np.random.randn(layers[i+1], 1))
    # do feedforward for x
    # where x is an input
    # return a list of a's and z's as expected.
    # a's and z's are layer by layer
    def feedforward(self, x):
        # make a copy of x
        a = np.copy(x)
        # this variable will contain all the z's
        z_s = []
        # this variable will contain all the a's
        # the output of the input layer is simply x, which is the
        # input, so we initialize a_s to contain a.
        a_s = [a]
        # for each layer do
        for i in range(len(self.weights)):
            # retrieve the appropriate activation function
            activation_function = self.getActivationFunction(self.activations[i])
            # compute z = w.a + b for this layer and append it to z_s
            z_s.append(self.weights[i].dot(a) + self.biases[i])
            # compute a = f(z) --
            # note that we apply the activation function only to the
            # last element of z_s (using the -1 index), i.e. the z
            # that was just appended for the current layer.
            a = activation_function(z_s[-1])
            # keep track of the new activation in a_s
            a_s.append(a)
        # return both z_s and a_s
        # we will have z_s and a_s for each layer
        return (z_s, a_s)
    # takes y -- the actual answer -- along with the a's and z's, and
    # calculates dLoss/dw and dLoss/db
    def backpropagation(self, y, z_s, a_s, normalize='no', ignore_b='no'):
        # initialize the lists of dLoss/dw and dLoss/db
        dw = []  # dLoss/dw
        db = []  # dLoss/db
        # create an empty list of deltas, one for each weight matrix
        deltas = [None] * len(self.weights)  # delta = dLoss/dz, known as the error for each layer
        # start from the back and insert the last layer's error
        # based on the squared loss function. Note that -1 is used to
        # fill things from the back of the list.
        # Also note that we need to use the derivative of
        # the activation function.
        # Note that we do not need the factor of 2 from the loss function derivative.
        # Again, this is for the last layer only!
        deltas[-1] = ((y - a_s[-1]) * (self.getDerivitiveActivationFunction(self.activations[-1]))(z_s[-1]))
        # Perform backpropagation:
        # for the rest of the deltas, go in reverse order
        for i in reversed(range(len(deltas) - 1)):
            deltas[i] = self.weights[i+1].T.dot(deltas[i+1]) * (self.getDerivitiveActivationFunction(self.activations[i])(z_s[i]))
        # now we need to update the weights based on the calculated
        # deltas.
        # We determine the batch size from the second dimension
        # of the shape of y, i.e. how many training cases there are --
        # for example there may be 10 y's, one for each x (one sample per column).
        batch_size = y.shape[1]
        # determine the two derivatives by taking
        # the average over the batch
        db = [d.dot(np.ones((batch_size, 1))) / float(batch_size) for d in deltas]
        dw = [d.dot(a_s[i].T) / float(batch_size) for i, d in enumerate(deltas)]
        #for i,d in enumerate(dw):
        #    print("dw[",i,"]=", d,"\n")
        # Q3: normalize if normalize is anything but 'no'.
        if(normalize != 'no'):
            dw = [d / np.linalg.norm(d) for d in dw]
        #for i,d in enumerate(dw):
        #    print("dwn[",i,"]=", d,"\n")
        # Q4: ignore the b's (zero out the bias gradients)
        if(ignore_b != 'no'):
            db = [d * 0 for d in db]
        #for i,d in enumerate(dw):
        #    print("dwn[",i,"]=", d,"\n")
        # return the derivatives with respect to the weight matrices and biases
        return dw, db
    # Now we will write the main training function that uses
    # feedforward and backpropagation many times (over a number of epochs).
    # lr (learning rate) is the eta in our equations.
    def train(self, x, y, batch_size=10, epochs=100, lr=0.01, normalize='no', ignore_b='no'):
        # create an array to store the loss for Q1
        saved_loss = np.zeros((epochs, 2))
        # update weights and biases based on the output
        # for the number of epochs
        for e in range(epochs):
            i = 0
            # Do the training in batches;
            # each batch is a subset of the original
            # data
            while(i < len(y)):
                # extract a batch
                x_batch = x[i:i+batch_size]
                y_batch = y[i:i+batch_size]
                # update i for the next batch
                i = i + batch_size
                # do the feedforward for the batch and update the weights
                # based on the average loss over the whole batch.
                z_s, a_s = self.feedforward(x_batch)
                # do the backpropagation
                dw, db = self.backpropagation(y_batch, z_s, a_s, normalize, ignore_b)
                # update the weights for each pair of weights and dw,
                # and biases and db
                self.weights = [w + lr * dweight for w, dweight in zip(self.weights, dw)]
                self.biases = [b + lr * dbias for b, dbias in zip(self.biases, db)]
            # calculate the loss (for the last batch of this epoch)
            # using a built-in norm function and save it
            loss = np.linalg.norm(a_s[-1] - y_batch)
            saved_loss[e, 0] = e
            saved_loss[e, 1] = loss
            #print("loss = ", np.linalg.norm(a_s[-1]-y_batch) )
        # for Q1
        return saved_loss
    # This function returns an activation function
    # given its name.
    @staticmethod
    def getActivationFunction(name):
        if(name == 'sigmoid'):
            return lambda x: np.exp(x) / (1 + np.exp(x))
        elif(name == 'linear'):
            return lambda x: x
        elif(name == 'relu'):
            def relu(x):
                y = np.copy(x)
                y[y < 0] = 0
                return y
            return relu
        # Q2
        elif(name == 'leaky_relu'):
            def leaky_relu(x):
                y = np.copy(x)
                y[y < 0] *= 0.01
                return y
            return leaky_relu
        else:
            print('Unknown activation function. linear is used')
            return lambda x: x
    # This function returns the derivative of an activation function
    # given its name.
    @staticmethod
    def getDerivitiveActivationFunction(name):
        if(name == 'sigmoid'):
            sig = lambda x: np.exp(x) / (1 + np.exp(x))
            return lambda x: sig(x) * (1 - sig(x))
        elif(name == 'linear'):
            return lambda x: 1
        elif(name == 'relu'):
            def relu_diff(x):
                y = np.copy(x)
                y[y >= 0] = 1
                y[y < 0] = 0
                return y
            return relu_diff
        # Q2
        elif(name == 'leaky_relu'):
            def leaky_relu_diff(x):
                dx = np.ones_like(x)
                dx[x < 0] = 0.01
                return dx
            return leaky_relu_diff
        else:
            print('Unknown activation function. linear is used')
            return lambda x: 1
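
Edit: to make the question more concrete, here is the kind of call I have been experimenting with. My (possibly wrong) understanding is that SGD just means using a batch of one sample per weight update. The sine-fitting data below is my own toy example, not part of the Medium post, and I laid the samples out one per column because backpropagation() reads the batch size from y.shape[1]:

import numpy as np

# Toy data, one sample per column (my assumption about the expected layout,
# since backpropagation() uses y.shape[1] as the batch size).
X = 2 * np.pi * np.random.rand(1, 200)   # shape (1, 200): 200 samples
Y = np.sin(X)                            # shape (1, 200)

nn = NeuralNetwork(layers=[1, 10, 1], activations=['sigmoid', 'linear'])

# Attempt at "SGD": batch_size=1, so that the weights would (I think)
# be updated after every single sample.
loss_history = nn.train(X, Y, batch_size=1, epochs=100, lr=0.1)

Is setting batch_size=1 really all there is to it, or do I also have to change the slicing inside train()? It slices x[i:i+batch_size] along the rows, while backpropagation() counts samples along the columns, and that mismatch is what confuses me.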