I am trying to finish my neural network for the MNIST handwritten digit database, trained with the backpropagation algorithm. My code is shown below - I will try to pare it down.
class NeuralNetwork:
def __init__(...):
#Here I just assign some values and initialize the weight and bias parameters
#L - number of layers, i.e. len(self.ld)
for i in range (1, L):
self.parameters["W" + str(i)] = np.random.randn(self.ld[i-1], self.ld[i])*0.01
self.parameters["b" + str(i)] = np.zeros((1, self.ld[i]))
def linear_forward(A_prev, W, b):
#A_prev - activation value from previous layer | W - weight | b - bias
Z = np.dot(A_prev, W) + b
cache = (Z, A_prev, W, b)
#storing these variables in the cache because they are needed for backpropagation
return cache
def linear_activation_forward(A_prev, W, b, activation):
#activation is a string like "relu" or "sigmoid" -> it determines which activation function is used
logits = [] #used later for softmax
cache = linear_forward(A_prev, W, b)
Z = cache[0] #Z is stored first in the cache
if activation == "sigmoid":
A, Z = sigmoid(Z)
#.
#.
#.
elif activation == "softmax":
logits, Z = softmax(Z)
A = logits #the softmax output is also the activation of this layer
newCache = (Z, A_prev, W, b)
return A, newCache, logits
#I will also include my activation functions sigmoid and softmax, even though they are not needed for understanding my problem
def sigmoid(z):
A = 1/(1+np.exp(-z))
assert(A.shape == z.shape)
zCache = z
return A, zCache
def softmax(z):
exp = np.exp(z)
assert(exp.shape == z.shape)
logits = exp/np.sum(exp, axis = 0, keepdims = True)
return logits, z
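Side note: plain np.exp(z) overflows for large z, which is one common source of NaN. For reference, a numerically stable softmax subtracts the row-wise maximum before exponentiating; a minimal sketch (assuming samples are stored as rows, so the normalization runs over axis=1 - that layout is my assumption, not taken from the code above):

import numpy as np

def stable_softmax(z):
    # subtracting the row max does not change the result but keeps np.exp from overflowing
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)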
def FeedForward(A_prev, parameters, activation):
L = len(parameters) //2
#parameters has weight and bias inside
caches = [] #this will have all my caches inside
for l in range(1, L):
A, cache, _ = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation)
caches.append(cache)
Al, cache, logits = linear_activation_forward(A, parameters["W" + str(L)], parameters["b"+str(L)], "softmax")
caches.append(cache)
return caches, Al, logits
def CategoricalCrossEntropy(y, yhat):
#I have been struggling quite a lot with cross-entropy and I am still not sure if I am using the right one
#Another version I have found is this one -> loss = -(np.sum(y) * np.sum(np.log(yhat)))
return (yhat-y)
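For comparison, the categorical cross-entropy that normally goes with softmax is -sum(y * log(yhat)), averaged over the batch, which is different from both versions above. A sketch of how I understand it (assuming y is one-hot and yhat contains probabilities, with samples as rows; the clip only guards against log(0)):

import numpy as np

def categorical_cross_entropy(y, yhat, eps=1e-12):
    # mean over the batch of -sum_k y_k * log(yhat_k)
    yhat = np.clip(yhat, eps, 1.0 - eps)  # keep log() away from 0 and exact 1
    return -np.mean(np.sum(y * np.log(yhat), axis=1))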
###Backpropagation###
def linear_backward(dZ, linearCache):
A_prev, W, b = linearCache
m = A_prev.shape[1]
#dW, db and dA_prev are derivatives of weight, bias and activation value from previous layer
dW = (1/m)*np.dot(A_prev.T, dZ)
db = (1/m)*np.sum(dZ, axis = 0, keepdims = True)
dA_prev = np.dot(dZ, W.T)
return dA_prev, dW, db
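Just as a sanity check on the shapes in this backward step, here is a tiny self-contained example with hypothetical sizes (a batch of 4 samples stored as rows, a 5-unit layer feeding a 3-unit layer; with that layout the batch size is A_prev.shape[0]):

import numpy as np

A_prev = np.random.randn(4, 5)               # activations of the previous layer
W = np.random.randn(5, 3)                    # weights of the current layer
dZ = np.random.randn(4, 3)                   # gradient w.r.t. the pre-activations
m = A_prev.shape[0]                          # number of samples in the batch
dW = np.dot(A_prev.T, dZ) / m                # (5, 3) - matches W
db = np.sum(dZ, axis=0, keepdims=True) / m   # (1, 3) - matches b
dA_prev = np.dot(dZ, W.T)                    # (4, 5) - matches A_prev
print(dW.shape, db.shape, dA_prev.shape)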
def linear_activation_backward(dA, currentCache, activation):
Z, A_prev, W, b = currentCache
linearCache = (A_prev, W, b)
if activation == "sigmoid":
dZ = sigmoid_backward(dA, Z)
#.
#.
#.
elif activation == "softmax":
dZ = softmax_backward(dA, Z)
dA_prev, dW, db = linear_backward(dZ, linearCache)
return dA_prev, dW, db
def sigmoid_backward(dA, Z):
A, Z = sigmoid(Z)
dZ = ((1-A)*dA)
return dZ
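For reference, the textbook derivative of the sigmoid is sigma(z) * (1 - sigma(z)), so the chain-rule step is usually written like this (a standalone sketch, not tied to the code above):

import numpy as np

def sigmoid_backward_reference(dA, Z):
    A = 1.0 / (1.0 + np.exp(-Z))
    return dA * A * (1.0 - A)  # dL/dZ = dL/dA * sigmoid'(Z)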
def softmax_backward(dA, Z):
#x, _ = softmax(dA)
#s = x.reshape(-1,1)
#dZ = np.diagflat(s) - np.dot(s, s.T)
J = - dA[..., None] * dA[:, None, :] # off-diagonal Jacobian
iy, ix = np.diag_indices_from(J[0])
J[:, iy, ix] = dA * (1. - dA) # diagonal
return J.sum(axis=1)
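For reference, when softmax is followed by categorical cross-entropy, the full Jacobian above is usually never built: the combined gradient with respect to the logits Z collapses to yhat - y. A minimal sketch of that shortcut (assuming Y is one-hot, yhat comes from softmax, and samples are rows):

import numpy as np

def softmax_cross_entropy_backward(yhat, Y):
    # gradient of the batch-averaged cross-entropy w.r.t. the pre-softmax logits Z
    m = Y.shape[0]
    return (yhat - Y) / m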
def Backpropagation(dAl, caches, activation):
#dAl - derivative of the loss with respect to the activation of the last layer
L = len(caches) #one cache per layer
current_cache = caches[L-1]
grads = {}
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = self.linear_activation_backward(dAl, current_cache, "softmax")
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = self.linear_activation_backward(grads["dA" + str(l+1)], current_cache, activation)
return grads
def Gradient_Descent(grads, parameters, learning_rate):
L = len(parameters)//2
for l in range(L):
self.parameters["W" + str(l+1)] = self.parameters["W" + str(l+1)] - grads["dW" + str(l+1)]*learning_rate
self.parameters["b" + str(l+1)] = self.parameters["b" + str(l+1)] - grads["db" + str(l+1)]*learning_rate
return parameters
def train(X, Y):
A = X
labels = Y
grads = {}
parameters = self.parameters
learning_rate = 0.01
for epochs in range(self.epochs):
A_prev = A
caches, Al, logits = FeedForward(A_prev, parameters, self.activation)
loss = CategoricalCrossEntropy(labels, softmax(Al))
dA = np.divide(Al - Y, np.divide(Al, 1-Al))
dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))
#not sure which one to choose; both of them make the loss NaN (see the small reproduction sketch after this code)
grads = Backpropagation(dAl, caches, self.activation)
if optimizer == "gradientDescent":
parameters = Gradient_Descent(grads, parameters, learning_rate)
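To show what I mean by the loss turning into NaN: as soon as any entry of Al becomes exactly 0 or 1, both expressions for dAl divide by zero and the NaN then spreads through Backpropagation. A tiny self-contained reproduction:

import numpy as np

Al = np.array([[1.0, 0.0, 0.0]])  # a fully saturated softmax output
Y = np.array([[1.0, 0.0, 0.0]])
with np.errstate(divide='ignore', invalid='ignore'):
    dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))
print(dAl)  # [[nan nan nan]]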
All of this gives a loss that looks like this:
Loss: [[[ 1.00582113e-04 9.72306300e-05 1.01261971e-04 ... -9.99894497e-01
1.11269120e-04 9.89210011e-05]
[ 1.16420796e-04 1.11381422e-04 -9.99906525e-01 ... 9.33883929e-05
1.10171812e-04 1.03851887e-04]
[ 1.01710476e-04 -9.99897105e-01 9.09620320e-05 ... 1.01604678e-04
1.00675631e-04 1.07537724e-04]
which is definitely wrong, and after a few iterations it outputs this:
Loss: [[[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
...
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]]
which is even more wrong.
Can somebody tell me what I am doing wrong? If needed, I can send you the whole file.
Thanks for any help!