Passing gradients from previous time steps to the current time step with GRUs in TensorFlow

Time: 2018-07-20 19:14:25

Tags: tensorflow rnn gradients

I have the following model in TensorFlow:

def output_layer(input_layer, num_labels):
    '''
    :param input_layer: 2D tensor
    :param num_labels: int. How many output labels in total? (10 for cifar10 and 100 for cifar100)
    :return: output layer Y = WX + B
    '''
    input_dim = input_layer.get_shape().as_list()[-1]
    fc_w = create_variables(name='fc_weights', shape=[input_dim, num_labels],
                            initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    fc_b = create_variables(name='fc_bias', shape=[num_labels], initializer=tf.zeros_initializer())

    fc_h = tf.matmul(input_layer, fc_w) + fc_b
    return fc_h

def model(input_features):

    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(gru1_cell_size)

        cell2 = tf.nn.rnn_cell.GRUCell(gru2_cell_size)

        mcell = tf.nn.rnn_cell.MultiRNNCell([cell1, cell2], state_is_tuple=False)

        # shape=(?, 64 + 32) 
        initial_state = tf.placeholder(shape=[None, gru1_cell_size + gru2_cell_size], dtype=tf.float32, name="initial_state")
        output, new_state = tf.nn.dynamic_rnn(mcell, input_features, dtype=tf.float32, initial_state=initial_state)

    with tf.variable_scope("output_reshaped"):
        # before, shape: (34, 1768, 32), after, shape: (34 * 1768, 32)
        output = tf.reshape(output, shape=[-1, gru2_cell_size])

    with tf.variable_scope("output_layer"):
        # shape: (34 * 1768, 3)
        predictions = output_layer(output, num_labels)
        predictions = tf.reshape(predictions, shape=[-1, 100, 3])
    return predictions, initial_state, new_state, output

So, as can be seen from the code, the cell size of the first GRU is 64 and the cell size of the second GRU is 32. The batch size is 34 (but that is not important here), and the input features have size 200. I tried to compute the gradients of the loss with respect to the trainable variables as follows:

local_grads_and_vars = optimizer.compute_gradients(loss, tf.trainable_variables())
# only the gradients are taken to add them later with the back propagated gradients from previous batch.
local_grads = [grad for grad, var in local_grads_and_vars]

for v in local_grads:
    print("v", v)

After printing the gradients, I get the following:

v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(264, 128), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(128,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(264, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer1/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul/Enter_grad/b_acc_3:0", shape=(96, 64), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd/Enter_grad/b_acc_3:0", shape=(64,), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/MatMul_1/Enter_grad/b_acc_3:0", shape=(96, 32), dtype=float32)
v Tensor("Optimizer/gradients/GRU_Layer2/rnn/while/gru_cell/BiasAdd_1/Enter_grad/b_acc_3:0", shape=(32,), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/MatMul_grad/tuple/control_dependency_1:0", shape=(32, 3), dtype=float32)
v Tensor("Optimizer/gradients/output_layer/add_grad/tuple/control_dependency_1:0", shape=(3,), dtype=float32)

Assume that after training the model on the first batch, i.e. after feeding an input_features tensor of shape (34, 100, 200), the gradients are saved. How can I back-propagate these gradients in the second mini-batch?

2 Answers:

Answer 0 (score: 0)

From the documentation of tf.gradients:

grad_ys is a list of tensors of the same length as ys that holds the initial gradients for each y in ys. When grad_ys is None, we fill in a tensor of '1's of the shape of y for each y in ys. A user can provide their own initial grad_ys to compute the derivatives using a different initial gradient for each y (e.g., if one wanted to weight the gradient differently for each value in each y).

So your grad_ys should be a list of the same length as the input ys.
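
As a minimal illustration of how grad_ys pairs with ys (the tensors x, y1, y2, g1, g2 below are made up just for this sketch), each entry of grad_ys supplies the upstream gradient for the matching entry of ys and must have the same shape:

import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=[2])
y1 = 2.0 * x                     # dy1/dx = 2
y2 = 3.0 * x                     # dy2/dx = 3

# One upstream gradient per y, with the same shape as that y.
g1 = tf.placeholder(tf.float32, shape=[2])
g2 = tf.placeholder(tf.float32, shape=[2])

# tf.gradients sums the weighted contributions: dx = 2 * g1 + 3 * g2
dx = tf.gradients([y1, y2], x, grad_ys=[g1, g2])

with tf.Session() as sess:
    print(sess.run(dx, feed_dict={x: np.zeros(2),
                                  g1: np.ones(2),
                                  g2: np.ones(2)}))   # [array([5., 5.], dtype=float32)]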

After reproducing your code, I was able to run the following (here the two GRU states are exposed as separate initial_state1/initial_state2 and new_state1/new_state2 tensors):

# One gradient placeholder per state, matching the two GRU cell sizes (64 and 32).
prev_grad_pl = [tf.placeholder(tf.float32, [batch, i]) for i in [64, 32]]
# Initial upstream gradients: all ones, with the same shapes as the placeholders.
prev_grad_init = {l: np.ones(l.get_shape().as_list()) for l in prev_grad_pl}
# Gradients of the new states w.r.t. the initial states, seeded through grad_ys.
prev_grads_val__ = tf.gradients([new_state1, new_state2], [initial_state1, initial_state2], grad_ys=prev_grad_pl)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    feed = {initial_state1: np.zeros([batch, gru1_cell_size]),
            initial_state2: np.zeros([batch, gru2_cell_size])}

    for k in prev_grad_init:
        feed[k] = prev_grad_init[k]

    grad1, grad2 = sess.run(prev_grads_val__, feed_dict=feed)
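
To relate this to the two-batch setup in the question, here is a rough, untested sketch of how the values could be threaded from one session run to the next. It reuses the tensors defined above (prev_grad_pl, prev_grads_val__ and the state placeholders) and assumes a hypothetical next_batch() iterator plus an input placeholder assumed here to be called input_features; how exactly the returned gradients should be combined with the loss of the current batch is a separate question:

# Rough sketch only (assumes the graph above plus a hypothetical next_batch()
# iterator and an input placeholder called input_features).
state1_val = np.zeros([batch, gru1_cell_size])
state2_val = np.zeros([batch, gru2_cell_size])
grad1_val = np.ones([batch, gru1_cell_size])   # seed grad_ys for the first batch
grad2_val = np.ones([batch, gru2_cell_size])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for features in next_batch():
        feed = {input_features: features,
                initial_state1: state1_val,
                initial_state2: state2_val,
                prev_grad_pl[0]: grad1_val,
                prev_grad_pl[1]: grad2_val}
        # Carry the final states forward and save the gradients that reached the
        # initial states so they can seed grad_ys on the next mini-batch.
        (state1_val, state2_val), (grad1_val, grad2_val) = sess.run(
            [[new_state1, new_state2], prev_grads_val__], feed_dict=feed)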

Answer 1 (score: 0)

Here is a solution with custom code:

The example below chains 2 GRUs, one following the other, based on the code in the question, and back-propagates to the initial state in 2 different ways: grads_wrt_initial_state_1 assembles the gradient by the chain rule (the gradient of loss1 with respect to initial_state, plus the gradient of loss2 routed through new_state1 via grad_ys), while grads_wrt_initial_state_2 lets TensorFlow compute the gradient of the total loss directly; the printed means and sums let you check that the two agree.

import tensorflow as tf
import numpy as np

cell_size = 32
seq_length = 1000
time_steps1 = 500
time_steps2 = seq_length - time_steps1
x_t = np.arange(1, seq_length + 1)
x_t_plus_1 = np.arange(2, seq_length + 2)

tf.set_random_seed(123)

m_dtype = tf.float32

input_1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="input_1")
input_2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="input_2")

labels1 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps1, 1], name="labels_1")
labels2 = tf.placeholder(dtype=m_dtype, shape=[None, time_steps2, 1], name="labels_2")
labels = tf.concat([labels1, labels2], axis=1, name="labels")

def model(input_feat1, input_feat2):

    with tf.variable_scope("GRU"):
        cell1 = tf.nn.rnn_cell.GRUCell(cell_size)
        cell2 = tf.nn.rnn_cell.GRUCell(cell_size)

        initial_state = tf.placeholder(shape=[None, cell_size], dtype=m_dtype, name="initial_state")

        with tf.variable_scope("First50"):
            # output1: shape=[1, time_steps1, 32]
            output1, new_state1 = tf.nn.dynamic_rnn(cell1, input_feat1, dtype=m_dtype, initial_state=initial_state)

        with tf.variable_scope("Second50"):
            # output2: shape=[1, time_steps2, 32]
            output2, new_state2 = tf.nn.dynamic_rnn(cell2, input_feat2, dtype=m_dtype, initial_state=new_state1)

    with tf.variable_scope("output"):
        # output shape: [1, time_steps1 + time_steps2, 32] => [1, 100, 32]
        output = tf.concat([output1, output2], axis=1)

        output = tf.reshape(output, shape=[-1, cell_size])
        output = tf.layers.dense(output, units=1)
        output = tf.reshape(output, shape=[1, time_steps1 + time_steps2, 1])

    with tf.variable_scope("outputs_1_2_reshaped"):
        output1 = tf.slice(input_=output, begin=[0, 0, 0], size=[-1, time_steps1, -1])
        output2 = tf.slice(input_=output, begin=[0, time_steps1, 0], size=[-1, time_steps2, 1])

    print(output.get_shape().as_list(), "1")
    print(output1.get_shape().as_list(), "2")
    print(output2.get_shape().as_list(), "3")

    return output, output1, output2, initial_state, new_state1, new_state2

def loss(output, output1, output2, labels, labels1, labels2):
    loss = tf.reduce_sum(tf.sqrt(tf.square(output - labels)))
    loss1 = tf.reduce_sum(tf.sqrt(tf.square(output1 - labels1)))
    loss2 = tf.reduce_sum(tf.sqrt(tf.square(output2 - labels2)))
    return loss, loss1, loss2

def optimize(loss, loss1, loss2, initial_state, new_state1, new_state2):
    with tf.name_scope('Optimizer'):
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            grads1 = tf.gradients(loss2, new_state1)
            grads2 = tf.gradients(loss1, initial_state)
            grads3 = tf.gradients(new_state1, initial_state, grad_ys=grads1)
            grads_wrt_initial_state_1 = tf.add(grads2, grads3)
            grads_wrt_initial_state_2 = tf.gradients(loss, initial_state, grad_ys=None)
    return grads_wrt_initial_state_1, grads_wrt_initial_state_2

output, output1, output2, initial_state, new_state1, new_state2 = model(input_1, input_2)
loss, loss1, loss2 = loss(output, output1, output2, labels, labels1, labels2)
grads_wrt_initial_state_1, grads_wrt_initial_state_2 = optimize(loss, loss1, loss2, initial_state, new_state1, new_state2)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    in1 = np.reshape(x_t[:time_steps1], newshape=(1, time_steps1, 1))
    in2 = np.reshape(x_t[time_steps1:], newshape=(1, time_steps2, 1))
    l1 = np.reshape(x_t_plus_1[:time_steps1], newshape=(1, time_steps1, 1))
    l2 = np.reshape(x_t_plus_1[time_steps1:], newshape=(1, time_steps2, 1))
    i_s = np.zeros([1, cell_size])

    t1, t2 = sess.run([grads_wrt_initial_state_1, grads_wrt_initial_state_2],
                      feed_dict={input_1: in1, input_2: in2, labels1: l1, labels2: l2, initial_state: i_s})

    print(np.mean(t1), np.mean(t2))
    print(np.sum(t1), np.sum(t2))