TensorFlow model

Time: 2021-07-09 08:49:14

Tags: python tensorflow deep-learning

I have a deep-learning TensorFlow program, and for training I use 10-fold cross-validation. The model trains well on the first fold, but as soon as the second fold starts, the output of the first layer is NaN and the model does not train. The input data is correct (I checked it), but I don't know what is happening inside the model. You can see the model at https://github.com/Chang-Li-HFUT/ACRNN.

The first layer is created with a function, and inside that function two variables are defined. During the first fold these two variables start out empty, but in the second fold they already hold NaN values. How can I reset these variables inside the function?
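If the graph is built once and the same session is reused for all ten folds, one way to do this is to re-run the variables' initializers at the start of every fold. A minimal sketch (the fold loop and `sess` are assumptions, not code from the model below):

# Sketch: `weight` and `bias` are the two variables returned by
# channel_wise_attention (see the function and the model code below);
# `sess` is assumed to be the training session.
reset_attention = tf.variables_initializer([weight, bias])

for fold in range(10):
    # Re-initialize every trainable variable so the new fold starts clean ...
    sess.run(tf.global_variables_initializer())
    # ... or, to reset only the attention layer, run just its initializer:
    # sess.run(reset_attention)

    # ... run this fold's training loop here ...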

The channel-wise-attention function:

import tensorflow as tf

def channel_wise_attention(feature_map, H, W, C, weight_decay=0.00004, scope='', reuse=None):
    """This method is used to add spatial attention to model.
    
    Parameters
    ---------------
    @feature_map: Which visual feature map as branch to use.
    @K: Map `H*W` units to K units. Now unused.
    @reuse: reuse variables if use multi gpus.
    
    Return
    ---------------
    @attended_fm: Feature map with Channel-Wise Attention.
    """
    with tf.variable_scope(scope, 'ChannelWiseAttention', reuse=reuse):
        # TensorFlow tensors are in BHWC format: H indexes rows, W indexes columns.
        # _, H, W, C = tuple([int(x) for x in feature_map.get_shape()])
        weight = tf.get_variable("weight", [C, C],
                              dtype=tf.float32,
                              initializer=tf.initializers.orthogonal,
                              regularizer=tf.contrib.layers.l2_regularizer(weight_decay))
        bias = tf.get_variable("bias", [C],
                              dtype=tf.float32,
                              initializer=tf.initializers.zeros)
        print(weight)
        print(bias)
        print('**************************************************************')
        transpose_feature_map = tf.transpose(tf.reduce_mean(feature_map, [1, 2], keep_dims=True),
                                             perm=[0, 3, 1, 2])
        channel_wise_attention_fm = tf.matmul(tf.reshape(transpose_feature_map, 
                                                         [-1, C]), weight) + bias
        channel_wise_attention_fm = tf.nn.sigmoid(channel_wise_attention_fm)
#         channel_wise_attention_fm = tf.clip_by_value(tf.nn.relu(channel_wise_attention_fm), 
#                                                      clip_value_min = 0, 
#                                                      clip_value_max = 1)
        attention = tf.reshape(tf.concat([channel_wise_attention_fm] * (H * W),
                                         axis=1), [-1, H, W, C])
        attended_fm = attention * feature_map
        return attended_fm, weight, bias
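For reference, this layer can be exercised on its own to confirm that `weight` and `bias` stay finite. A small sketch with assumed shapes (H=1, W=128, C=32), not the shapes used in the model below:

import numpy as np

# Sketch: run the attention layer on random input and inspect the variables.
x = tf.placeholder(tf.float32, shape=[None, 1, 128, 32])    # assumed H=1, W=128, C=32
fm, w, b = channel_wise_attention(x, 1, 128, 32, scope='check')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out, w_val, b_val = sess.run(
        [fm, w, b],
        feed_dict={x: np.random.randn(4, 1, 128, 32).astype(np.float32)})
    print(out.shape)                                         # (4, 1, 128, 32)
    print(np.isnan(w_val).any(), np.isnan(b_val).any())      # both should be False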

The model is:

# input placeholder
X = tf.placeholder(tf.float32, shape=[None, input_height, input_width, input_channel_num], name = 'X')
Y = tf.placeholder(tf.float32, shape=[None, num_labels], name = 'Y')
train_phase = tf.placeholder(tf.bool, name = 'train_phase')
keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# channel-wise attention layer
X_1 = tf.transpose(X,[0, 3, 2, 1])
# print(X_1)
conv, weight, bias = channel_wise_attention(X_1, 1, window_size, n_channel, weight_decay=0.00004, scope='', reuse=None)
conv_1 = tf.transpose(conv,[0, 3, 2, 1])
# print(conv_1)

# CNN layer
conv_1 = cnn_2d.apply_conv2d(conv_1, kernel_height_1st, kernel_width_1st, input_channel_num, conv_channel_num, kernel_stride, train_phase)
print("conv 1 shape: ", conv_1.get_shape().as_list())
pool_1 = cnn_2d.apply_max_pooling(conv_1, pooling_height_1st, pooling_width_1st, pooling_stride_1st)
print("pool 1 shape: ", pool_1.get_shape().as_list())
pool_1_shape = pool_1.get_shape().as_list()
pool1_flat = tf.reshape(pool_1, [-1, pool_1_shape[1]*pool_1_shape[2]*pool_1_shape[3]])
fc_drop = tf.nn.dropout(pool1_flat, keep_prob)

# LSTMs layer
lstm_in = tf.reshape(fc_drop, [-1, num_timestep, pool_1_shape[1]*pool_1_shape[2]*pool_1_shape[3]])
cells = []
for _ in range(2):
    cell = tf.contrib.rnn.BasicLSTMCell(n_hidden_state, forget_bias=1.0, state_is_tuple=True)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    cells.append(cell)
lstm_cell = tf.contrib.rnn.MultiRNNCell(cells)

init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)

# output ==> [batch, step, n_hidden_state]
rnn_op, states = tf.nn.dynamic_rnn(lstm_cell, lstm_in, initial_state=init_state, time_major=False)

#self-attention layer
with tf.name_scope('Attention_layer'):
    attention_op = multi_dimensional_attention(rnn_op, 64, scope=None,
                                               keep_prob=1., is_train=None, wd=0., activation='elu',
                                               tensor_dict=None, name=None)

    attention_drop = tf.nn.dropout(attention_op, keep_prob)

    y_ = cnn_2d.apply_readout(attention_drop, rnn_op.shape[2].value, num_labels)

# softmax layer: probability prediction
y_prob = tf.nn.softmax(y_, name = "y_prob")

# class prediction
y_pred = tf.argmax(y_prob, 1, name = "y_pred")
# y_pred = tf.cast(y_pred, tf.float32)

# cross entropy cost function

# crossE = tf.nn.softmax_cross_entropy_with_logits(logits=y_, labels=Y)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y_, labels=Y), name = 'loss')
# cost = tf.losses.softmax_cross_entropy(onehot_labels=Y, logits=y_pred)
# cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=tf.cast(tf.argmax(tf.nn.softmax(y_), 1), tf.float32), labels=Y), name = 'loss')
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    # set training SGD optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# get correctly predicted object
correct_prediction = tf.equal(tf.argmax(tf.nn.softmax(y_), 1), tf.argmax(Y, 1))

# calculate prediction accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name = 'accuracy')
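
Since every variable above lives in the default graph, another common pattern for k-fold training in TF 1.x is to rebuild the graph and open a fresh session for each fold, so nothing (NaNs included) carries over between folds. A rough sketch, assuming the graph-construction code above is wrapped in a hypothetical build_model() helper:

# Sketch: rebuild the whole model per fold. build_model() is a hypothetical
# wrapper around the graph-construction code above, returning the ops needed
# for training and evaluation.
for fold_idx in range(10):
    tf.reset_default_graph()                  # drop the previous fold's variables
    X, Y, train_phase, keep_prob, optimizer, cost, accuracy = build_model()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # run this fold's training and evaluation loop here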

0 Answers:

There are no answers yet.