In the code below I stack 10 convolutional layers in front of the LSTM to compute the output.
With a single conv layer followed by the LSTM everything works fine. But as soon as I add more conv layers (the code below has 10), the loss becomes huge and accuracy starts dropping. I apply batch normalization after every conv layer to make sure the gradients do not vanish. To sanity-check the network I tried to fit it on just 5 to 10 examples to see whether it could overfit them, but it still produces a huge loss; if I remove some conv layers it fits them fine, and if I add more examples the loss decreases up to a point and then stalls. What is the bug here?
Edit: here is the reproducible code if you want to try it - link

import tensorflow as tf  # the code uses the old TF 0.x API (tf.nn.rnn, tf.split(dim, num, value))

# The post leaves these undefined; the values below are assumptions so the snippet runs:
time_steps = 20      # sequence length (assumed)
embedding = 100      # embedding dimension (assumed)
_units = 128         # LSTM hidden size (assumed)
num_classes = 10     # number of classes (assumed)

X = tf.placeholder(tf.float32, [None,time_steps,embedding])
Y = tf.placeholder(tf.int32, [None])
A = tf.placeholder(tf.bool)      # batch-norm is_training flag
B = tf.placeholder(tf.float32)   # dropout keep probability
x = tf.expand_dims(X,3)
# conv layer 1: a [1, embedding] filter collapses the embedding dimension to width 1
filter_shape = [1, embedding, 1, 64]
conv_weights = tf.get_variable("conv_weights1" , filter_shape, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases = tf.Variable(tf.constant(0.1, shape=[64]))
conv = tf.nn.conv2d(x, conv_weights, strides=[1,1,1,1], padding = "VALID")
normalize = tf.nn.elu(conv + conv_biases)
tf_normalize = tf.contrib.layers.batch_norm(inputs = normalize,is_training = A)
outputs_fed_lstm = tf_normalize
# conv layers 2-10 are 1x1 convolutions that only change the channel depth
# (64 -> 64 -> 64 -> 128 -> 128 -> 128 -> 256 -> 256 -> 256 -> 512), each followed by ELU and batch norm
filter_shape2 = [1, 1, 64, 64]
conv_weights2 = tf.get_variable("conv_weights2" , filter_shape2, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.conv2d(outputs_fed_lstm, conv_weights2, strides=[1,1,1,1], padding = "VALID")
normalize2 = tf.nn.elu(conv2 + conv_biases2)
tf_normalize2 = tf.contrib.layers.batch_norm(inputs = normalize2,is_training = A)
outputs_fed_lstm2 = tf_normalize2
filter_shape3 = [1, 1, 64, 64]
conv_weights3 = tf.get_variable("conv_weights3" , filter_shape3, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases3 = tf.Variable(tf.constant(0.1, shape=[64]))
conv3 = tf.nn.conv2d(outputs_fed_lstm2, conv_weights3, strides=[1,1,1,1], padding = "VALID")
normalize3 = tf.nn.elu(conv3 + conv_biases3)
tf_normalize3 = tf.contrib.layers.batch_norm(inputs = normalize3,is_training = A)
outputs_fed_lstm3 = tf_normalize3
filter_shape4 = [1, 1, 64, 128]
conv_weights4 = tf.get_variable("conv_weights4" , filter_shape4, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases4 = tf.Variable(tf.constant(0.1, shape=[128]))
conv4 = tf.nn.conv2d(outputs_fed_lstm3, conv_weights4, strides=[1,1,1,1], padding = "VALID")
normalize4 = tf.nn.elu(conv4 + conv_biases4)
tf_normalize4 = tf.contrib.layers.batch_norm(inputs = normalize4,is_training = A)
outputs_fed_lstm4 = tf_normalize4
filter_shape5 = [1, 1, 128, 128]
conv_weights5 = tf.get_variable("conv_weights5" , filter_shape5, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases5 = tf.Variable(tf.constant(0.1, shape=[128]))
conv5 = tf.nn.conv2d(outputs_fed_lstm4, conv_weights5, strides=[1,1,1,1], padding = "VALID")
normalize5 = tf.nn.elu(conv5 + conv_biases5)
tf_normalize5 = tf.contrib.layers.batch_norm(inputs = normalize5,is_training = A)
outputs_fed_lstm5 = tf_normalize5
filter_shape6 = [1, 1, 128, 128]
conv_weights6 = tf.get_variable("conv_weights6" , filter_shape6, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases6 = tf.Variable(tf.constant(0.1, shape=[128]))
conv6 = tf.nn.conv2d(outputs_fed_lstm5, conv_weights6, strides=[1,1,1,1], padding = "VALID")
normalize6 = tf.nn.elu(conv6 + conv_biases6)
tf_normalize6 = tf.contrib.layers.batch_norm(inputs = normalize6,is_training = A)
outputs_fed_lstm6 = tf_normalize6
filter_shape7 = [1, 1, 128, 256]
conv_weights7 = tf.get_variable("conv_weights7" , filter_shape7, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases7 = tf.Variable(tf.constant(0.1, shape=[256]))
conv7 = tf.nn.conv2d(outputs_fed_lstm6, conv_weights7, strides=[1,1,1,1], padding = "VALID")
normalize7 = tf.nn.elu(conv7 + conv_biases7)
tf_normalize7 = tf.contrib.layers.batch_norm(inputs = normalize7,is_training = A)
outputs_fed_lstm7 = tf_normalize7
filter_shape8 = [1, 1, 256, 256]
conv_weights8 = tf.get_variable("conv_weights8" , filter_shape8, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases8 = tf.Variable(tf.constant(0.1, shape=[256]))
conv8 = tf.nn.conv2d(outputs_fed_lstm7, conv_weights8, strides=[1,1,1,1], padding = "VALID")
normalize8 = tf.nn.elu(conv8 + conv_biases8)
tf_normalize8 = tf.contrib.layers.batch_norm(inputs = normalize8,is_training = A)
outputs_fed_lstm8 = tf_normalize8
filter_shape9 = [1, 1, 256, 256]
conv_weights9 = tf.get_variable("conv_weights9" , filter_shape9, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases9 = tf.Variable(tf.constant(0.1, shape=[256]))
conv9 = tf.nn.conv2d(outputs_fed_lstm8, conv_weights9, strides=[1,1,1,1], padding = "VALID")
normalize9 = tf.nn.elu(conv9 + conv_biases9)
tf_normalize9 = tf.contrib.layers.batch_norm(inputs = normalize9,is_training = A)
outputs_fed_lstm9 = tf_normalize9
# conv layer 10 (note: its variables use the "0" suffix)
filter_shape0 = [1, 1, 256, 512]
conv_weights0 = tf.get_variable("conv_weights0" , filter_shape0, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases0 = tf.Variable(tf.constant(0.1, shape=[512]))
conv0 = tf.nn.conv2d(outputs_fed_lstm9, conv_weights0, strides=[1,1,1,1], padding = "VALID")
normalize0 = tf.nn.elu(conv0 + conv_biases0)
tf_normalize0 = tf.contrib.layers.batch_norm(inputs = normalize0,is_training = A)
outputs_fed_lstm0 = tf_normalize0
# dropout on the final conv features, then reshape into the list-of-steps format tf.nn.rnn expects
outputs_fed_lstm10 = tf.nn.dropout(x = outputs_fed_lstm0, keep_prob = B)
x = tf.squeeze(outputs_fed_lstm10, [2])   # drop the width-1 dim -> [batch, time_steps, 512]
x = tf.transpose(x, [1, 0, 2])            # -> [time_steps, batch, 512]
x = tf.reshape(x, [-1, 512])
x = tf.split(0, time_steps, x)            # list of time_steps tensors, each [batch, 512]
# single-layer LSTM over the conv features
lstm = tf.nn.rnn_cell.LSTMCell(num_units = _units, state_is_tuple=True)
# multi_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * lstm_layers, state_is_tuple = True)
outputs , state = tf.nn.rnn(lstm,x, dtype = tf.float32)
# softmax classifier on the last time step's LSTM output
weights = tf.Variable(tf.random_normal([_units,num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))
logits = tf.matmul(outputs[-1], weights) + biases
c_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits,Y)
loss = tf.reduce_mean(c_loss)
# Adam with an exponentially decaying learning rate
global_step = tf.Variable(0, name="global_step", trainable=False)
decayed_learning_rate = tf.train.exponential_decay(learning_rate = 0.01,global_step = global_step,decay_steps = 300, decay_rate = 0.96, staircase = True)
optimizer= tf.train.AdamOptimizer(learning_rate = decayed_learning_rate)
#grads_and_vars = optimizer.compute_gradients(loss,[conv_weights0])
minimize_loss = optimizer.minimize(loss, global_step=global_step)
correct_predict = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
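
As an aside, the ten near-identical blocks above can be written as one loop. The sketch below is illustrative only (conv_stack and its argument names are made up, not from the original post); it keeps the same TF 0.x API, initializer, and channel schedule as the code above:

# Minimal sketch (hypothetical helper, not the code that produced the results above):
# builds the same 10-layer conv stack in a loop.
def conv_stack(x, is_training, channels=(64, 64, 64, 128, 128, 128, 256, 256, 256, 512)):
    out = x                                     # [batch, time_steps, embedding, 1]
    in_channels = 1
    for i, out_channels in enumerate(channels):
        # layer 1 spans the embedding dimension; layers 2+ are 1x1 convs
        width = embedding if i == 0 else 1
        w = tf.get_variable("conv_weights_stack%d" % (i + 1),
                            [1, width, in_channels, out_channels], tf.float32,
                            tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
        b = tf.Variable(tf.constant(0.1, shape=[out_channels]))
        out = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="VALID")
        out = tf.contrib.layers.batch_norm(inputs=tf.nn.elu(out + b), is_training=is_training)
        in_channels = out_channels
    return out                                  # [batch, time_steps, 1, channels[-1]]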
Answer (score: 1):
I have figured out why this happens. When the layer widths were chosen arbitrarily, say 56, then 86, then 496 filters, the problem occurred no matter how many layers I added: huge loss and very low accuracy. The fix was to make the filter counts follow a regular pattern like 64, 128, 256, 512.
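
If that observation holds, the hypothetical conv_stack helper sketched above makes the width schedule explicit; for example, a four-layer stack on the doubling pattern:

features = conv_stack(tf.expand_dims(X, 3), A, channels=(64, 128, 256, 512))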