So, I'm building an RNN from scratch using numpy in order to understand its inner workings. My backpropagation through time is below:
def backprop_through_time(self, X, Y):
    assert X.ndim == 3
    seq_length = Y.shape[1] if self.return_sequences else 1
    _, (Z_states, States, Z_outs, Outs) = self.feed_forward(X, cache=True)
    if not self.return_sequences:
        Outs = Outs[:, -1, :]
    # set up gradient accumulators
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    dLdB_state = np.zeros(self.B_state.shape)
    dLdB_out = np.zeros(self.B_out.shape)
    dLdOuts = self.loss_function_prime(Outs, Y)
    if not self.return_sequences:
        # we need dLdOuts to have a seq_length dim at axis 1
        dLdOuts = np.expand_dims(dLdOuts, axis=1)
    for t in range(seq_length):
        adjusted_t = seq_length - 1 if not self.return_sequences else t
        # forward-pass equations this gradient follows:
        # Z_state = dot(X_t, self.U) + dot(State_{t-1}, self.W) + self.B_state
        # State_t = f(Z_state)
        # Z_out   = dot(State_t, self.V) + self.B_out
        # Out_t   = g(Z_out)
        dOuts_tdZ_out = self.output_activation_function_prime(Z_outs[:, adjusted_t, :])
        dLdZ_out = np.multiply(dLdOuts[:, adjusted_t, :], dOuts_tdZ_out)
        dLdV += np.dot(States[:, adjusted_t, :].T, dLdZ_out)
        dLdB_out += np.sum(dLdZ_out, axis=0, keepdims=True)
        dLdZ_state = np.multiply(np.dot(dLdZ_out, self.V.T),
                                 self.hidden_activation_function_prime(Z_states[:, adjusted_t, :]))
        # truncated backpropagation through time over the hidden states
        for t_prev in range(max(0, adjusted_t - self.backprop_through_time_limit), adjusted_t + 1)[::-1]:
            dLdB_state += np.sum(dLdZ_state, axis=0, keepdims=True)
            dLdW += np.dot(States[:, t_prev - 1, :].T, dLdZ_state)
            dLdU += np.dot(X[:, t_prev, :].T, dLdZ_state)
            # NOTE: this line contains the bug explained in the EDIT below --
            # it should use Z_states[:, t_prev-1, :], not States[:, t_prev-1, :]
            dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
                                     self.hidden_activation_function_prime(States[:, t_prev - 1, :]))
    return (dLdU, dLdV, dLdW), (dLdB_state, dLdB_out)
However, I still can't get gradient checking to pass for the parameters dLdU, dLdW, and dLdB_state. I've gone over the math about a dozen times by now, and I can't find the problem in my implementation.
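By gradient checking I mean comparing each analytic gradient against a centered finite difference of the loss. This is a minimal sketch of the kind of check I'm running; the helper name numerical_gradient_check is my own, and it assumes that rnn.loss_function(out, Y) returns a scalar and that the first value feed_forward returns is what the loss is computed on (neither is shown in the code above):

import numpy as np

def numerical_gradient_check(rnn, X, Y, analytic_grads, eps=1e-5):
    # analytic_grads: list of (name, param_array, analytic_grad) triples, e.g.
    # [("U", rnn.U, dLdU), ("W", rnn.W, dLdW), ("B_state", rnn.B_state, dLdB_state)]
    for name, param, grad in analytic_grads:
        num_grad = np.zeros_like(param)
        it = np.nditer(param, flags=['multi_index'])
        while not it.finished:
            idx = it.multi_index
            orig = param[idx]
            param[idx] = orig + eps          # loss at theta + eps
            out_plus, _ = rnn.feed_forward(X, cache=True)
            param[idx] = orig - eps          # loss at theta - eps
            out_minus, _ = rnn.feed_forward(X, cache=True)
            param[idx] = orig                # restore the parameter
            num_grad[idx] = (rnn.loss_function(out_plus, Y)
                             - rnn.loss_function(out_minus, Y)) / (2 * eps)
            it.iternext()
        denom = np.abs(num_grad) + np.abs(grad) + 1e-12
        print("{}: max relative error {:.3e}".format(
            name, np.max(np.abs(num_grad - grad) / denom)))

With this check, dLdV and dLdB_out match the numerical gradients, but dLdU, dLdW, and dLdB_state do not.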
I assume X and Y are both 3D matrices, with X of shape X.shape := (batch_size, seq_length, input_dim) and Y of shape Y.shape := (batch_size, seq_length, output_dim). When caching the feed_forward operation, I return Z_states of shape Z_states.shape := (batch_size, seq_length, hidden_dim), Z_outs and Outs of shapes Z_outs.shape, Outs.shape := (batch_size, seq_length, output_dim), and States of shape States.shape := (batch_size, seq_length+1, hidden_dim). States[:,-1,:], of shape States[:,-1,:].shape := (batch_size, hidden_dim), is the original zeros used to initialize the RNN state. Can someone help me out?
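For completeness, here is roughly what my feed_forward looks like with cache=True, reconstructed from the shapes above and the equations in the comments of backprop_through_time. This is a paraphrase for illustration, and the attribute names self.hidden_dim, self.output_dim, self.hidden_activation_function, and self.output_activation_function are assumed from the *_prime names used above:

def feed_forward(self, X, cache=False):
    batch_size, seq_length, _ = X.shape
    Z_states = np.zeros((batch_size, seq_length, self.hidden_dim))
    # seq_length+1 slots: States[:,-1,:] stays zero and acts as the initial state
    States = np.zeros((batch_size, seq_length + 1, self.hidden_dim))
    Z_outs = np.zeros((batch_size, seq_length, self.output_dim))
    Outs = np.zeros((batch_size, seq_length, self.output_dim))
    for t in range(seq_length):
        # Z_state = dot(X_t, U) + dot(State_{t-1}, W) + B_state; at t=0,
        # States[:, t-1, :] wraps around to the zero initial state
        Z_states[:, t, :] = (np.dot(X[:, t, :], self.U)
                             + np.dot(States[:, t - 1, :], self.W) + self.B_state)
        States[:, t, :] = self.hidden_activation_function(Z_states[:, t, :])
        # Z_out = dot(State_t, V) + B_out; Out_t = g(Z_out)
        Z_outs[:, t, :] = np.dot(States[:, t, :], self.V) + self.B_out
        Outs[:, t, :] = self.output_activation_function(Z_outs[:, t, :])
    out = Outs if self.return_sequences else Outs[:, -1, :]
    if cache:
        return out, (Z_states, States, Z_outs, Outs)
    return out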
EDIT
I found the answer. My math was right, but I was calling the wrong variable. When I update dLdZ_state in the second inner loop (the backpropagation through time), I was multiplying by self.hidden_activation_function_prime(States[:,t_prev-1,:]) when it should instead be self.hidden_activation_function_prime(Z_states[:,t_prev-1,:]).
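That is, the last statement of the second inner loop should read:

dLdZ_state = np.multiply(np.dot(dLdZ_state, self.W.T),
                         self.hidden_activation_function_prime(Z_states[:, t_prev - 1, :]))

The derivative of the hidden activation has to be evaluated at the pre-activation Z_states, not at the already-activated States.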