I've been trying to build a vanilla 3-layer RNN in Python 3.6, but whenever I train it, the training loss decreases for the first X training iterations and then just moves up and down sporadically, and I can't figure out the problem. I don't want to use TensorFlow or Keras or any other deep learning framework right now, because I'm trying to get a better understanding of how these NNs work.
I'm fairly sure the problem lies either in the way I add matrices in my forward-prop function, or in the get-error and weight-update parts of my backprop function, so I'll post them below.
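For reference, here is what I think the gradients should be, written with my own variable names (y is the target C_real, * is elementwise, J is the squared-error cost from get_cost below, and since my forward pass computes B = sigmoid(B1 + B_t1), I'm only backpropagating one timestep into the past):

    delta3    = (C - y) * sigmoidPrime(C1)
    dJ/dBC    = B.T dot delta3
    delta2    = (delta3 dot BC.T) * sigmoidPrime(B1 + B_t1)
    dJ/dAB    = A.T dot delta2
    dJ/dB_RNN = B_t.T dot delta2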
Sorry if my code is a bit messy, but here is my backprop function (forward prop is RNN_forward in the full code below):
def RNN_backprop(self, C_real):
    # did this so I could transpose a 1D matrix
    BT = self.B[np.newaxis]
    # get error of BC weight matrix
    delta3 = np.multiply(-(C_real - self.C), self.sigmoidPrime(self.C1))
    BCp = np.dot(BT.T, delta3)
    BCpT = BCp[np.newaxis]
    # get error of AB weight matrix
    delta2 = np.dot(delta3, self.BC.T) * self.sigmoidPrime(self.B1)
    AT, d2 = self.A[np.newaxis], delta2[np.newaxis]
    ABp = np.dot(AT.T, d2)
    # get error of weight matrix linked to previous time step
    delta2t = np.dot(delta3, self.BC.T) * self.sigmoidPrime(self.B_t1)
    B_RNNT, d2t = self.B_t[np.newaxis], delta2t[np.newaxis]
    B_RNNp = np.dot(B_RNNT.T, d2t)
    # training speed
    weight_multiplier = 10
    # update weight matrices
    self.BC = self.BC - BCpT.T * weight_multiplier
    self.AB = self.AB - ABp * weight_multiplier
    self.B_RNN = self.B_RNN - B_RNNp * weight_multiplier
    # store layer 2 to be used in the next timestep
    self.B_t = self.B
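For what it's worth, a finite-difference gradient check is one way to test code like this. Here is a rough sketch (not part of the class; eps, the weight index, and the training pair As[0] / C_reals[0] are arbitrary picks, and the recurrent state is saved and restored so both forward passes see the same B_t):

    eps = 1e-5
    x, y = np.array(As[0]), C_reals[0]
    saved_B_t = NN.B_t              # remember the recurrent state
    NN.RNN_forward(x)
    cost0 = NN.get_cost(y)
    NN.BC[0, 0] += eps              # nudge a single weight
    NN.B_t = saved_B_t              # restore state for a fair comparison
    NN.RNN_forward(x)
    cost1 = NN.get_cost(y)
    NN.BC[0, 0] -= eps              # undo the nudge
    print("numeric dJ/dBC[0,0]:", (cost1 - cost0) / eps)
    # compare against BCp[0] from RNN_backprop; a big mismatch would
    # point at the analytic gradient being wrong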
My backprop and update-weight-matrix functions are combined into that one function, sorry everything is so messy.

Edit: here is the full code. It's commented really badly, so it's hard to read, but it's all here; some lines are commented out because I wasn't sure whether I'd try to use them.
import numpy as np

class NN(object):
    def __init__(self):
        # layer sizes: 4 inputs (A), 6 hidden units (B), 1 output (C)
        self.A_size = 4
        self.B_size = 6
        self.C_size = 1

    def init_weights(self):
        self.AB = np.random.randn(self.A_size, self.B_size)
        self.BC = np.random.randn(self.B_size, self.C_size)
        self.B_t = [0]  # placeholder until the first hidden state exists
        self.B_RNN = np.random.randn(self.B_size, self.B_size)

    def sigmoid(self, X1):
        return 1 / (1 + np.exp(-X1))

    def sigmoidPrime(self, X1):
        return np.exp(-X1) / ((1 + np.exp(-X1)) ** 2)

    def print_status(self, NN_type):
        print("---------Status---------")
        print("A: ", self.A)
        print("AB: ", self.AB)
        print("B: ", self.B)
        print("BC: ", self.BC)
        print("C: ", self.C)
        if NN_type == "RNN":
            print("Previous B: ", self.B_t)
        if NN_type == "LSTM":
            pass
        # print("Error: ", self.cost)
        print("---------Done---------")

    def RNN_forward(self, A):
        self.A = A
        self.B1 = np.dot(self.A, self.AB)
        if len(self.B_t) > 2:
            # hidden state from the previous timestep feeds back in
            self.B_t1 = np.dot(self.B_t, self.B_RNN)
            self.B = self.sigmoid(self.B1 + self.B_t1)
            # self.B = self.sigmoid(np.tanh(self.B1) + np.tanh(self.B_t1))
        else:
            # very first call: no previous hidden state exists yet
            self.B = self.sigmoid(self.B1)
            self.B_t = self.B
            self.B_t1 = np.dot(self.B_t, self.B_RNN)
            print('this should only print once')
        self.C1 = np.dot(self.B, self.BC)
        self.C = self.sigmoid(self.C1)
        return self.C

    def skip_backprop(self):
        # store the hidden state without updating any weights
        self.B_t = self.B

    def get_cost(self, C_real):
        # self.cost = 0.5*sum((C_real-self.C)**2)
        self.cost = 0.5 * ((C_real - self.C) ** 2)
        return self.cost

    def RNN_backprop(self, C_real):
        BT = self.B[np.newaxis]
        # output-layer error and BC gradient
        delta3 = np.multiply(-(C_real - self.C), self.sigmoidPrime(self.C1))
        BCp = np.dot(BT.T, delta3)
        BCpT = BCp[np.newaxis]
        # hidden-layer error and AB gradient
        delta2 = np.dot(delta3, self.BC.T) * self.sigmoidPrime(self.B1)
        AT, d2 = self.A[np.newaxis], delta2[np.newaxis]
        ABp = np.dot(AT.T, d2)
        # error for the recurrent weight matrix
        delta2t = np.dot(delta3, self.BC.T) * self.sigmoidPrime(self.B_t1)
        B_RNNT, d2t = self.B_t[np.newaxis], delta2t[np.newaxis]
        B_RNNp = np.dot(B_RNNT.T, d2t)
        # Important
        # weight_multiplier = 5 * (np.sum(np.absolute(ABp)) + np.sum(np.absolute(BCpT)) + np.sum(np.absolute(B_RNNp)))
        weight_multiplier = 0.01
        self.BC = self.BC - BCpT.T * weight_multiplier
        self.AB = self.AB - ABp * weight_multiplier
        self.B_RNN = self.B_RNN - B_RNNp * weight_multiplier
        self.B_t = self.B

NN = NN()
NN.init_weights()
# important
iterations = 100000
for a in range(iterations):
    total_error = 0
    for i in range(50):
        NN.RNN_forward(np.array(As[i]))
        NN.RNN_backprop(C_reals[i])
        total_error += NN.get_cost(C_reals[i])
    if a % 500 == 0:
        print("Error: ", total_error)
NN.RNN_forward(np.array([1, 1, 1, 0.2]))
NN.skip_backprop()
print("0.4: ", NN.RNN_forward(np.array([1, 1, 1, 0.3])))
NN.RNN_forward(np.array([1, 1, 1, 0.4]))
NN.skip_backprop()
print("0.2: ", NN.RNN_forward(np.array([1, 1, 1, 0.3])))
I'm happy to post any other bits anyone might need to help diagnose my problem.
Thanks in advance for any suggestions or potential solutions!
Answer (score: 0):
A common way to eliminate this kind of behavior is to lower the learning rate as training continues. I can't explain why, but adding batch normalization also helped me fix this in one case. I was likewise able to get rid of it after increasing the network's capacity.
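For example, with the plain-NumPy training loop in the question, a simple step decay could look like the sketch below. The schedule constants are made up for illustration, and it assumes RNN_backprop is changed to take the learning rate as an argument instead of hard-coding weight_multiplier:

    initial_lr = 0.01
    for a in range(iterations):
        lr = initial_lr * (0.5 ** (a // 20000))   # halve the rate every 20k epochs
        total_error = 0
        for i in range(50):
            NN.RNN_forward(np.array(As[i]))
            NN.RNN_backprop(C_reals[i], lr)       # lr passed in, not hard-coded
            total_error += NN.get_cost(C_reals[i])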