In the code below I stack 10 convolutional layers in front of the LSTM to compute the output.
With a single conv layer followed by the LSTM everything works fine. But as soon as I add more conv layers (the code below has 10), the loss becomes huge and accuracy starts dropping. I apply batch normalization after every conv layer to make sure the gradients do not vanish. To sanity-check the network I tried to fit it on just 5 to 10 examples to see whether it could overfit them, but it still produces a huge loss; if I remove some conv layers it fits them fine, and if I add more examples the loss decreases up to a point and then stalls. What is the bug here?
Edit: here is the reproducible code if you want to try it - link

import tensorflow as tf  # the code uses the old TF 0.x API (tf.nn.rnn, tf.split(dim, num, value))

# The post leaves these undefined; the values below are assumptions so the snippet runs:
time_steps = 20      # sequence length (assumed)
embedding = 100      # embedding dimension (assumed)
_units = 128         # LSTM hidden size (assumed)
num_classes = 10     # number of classes (assumed)

X = tf.placeholder(tf.float32, [None,time_steps,embedding])
Y = tf.placeholder(tf.int32, [None])
A = tf.placeholder(tf.bool)      # batch-norm is_training flag
B = tf.placeholder(tf.float32)   # dropout keep probability
x = tf.expand_dims(X,3)
# conv layer 1: a [1, embedding] filter collapses the embedding dimension to width 1
filter_shape = [1, embedding, 1, 64]
conv_weights = tf.get_variable("conv_weights1" , filter_shape, tf.float32, tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases = tf.Variable(tf.constant(0.1, shape=[64]))
conv = tf.nn.conv2d(x, conv_weights, strides=[1,1,1,1], padding = "VALID")
normalize = tf.nn.elu(conv + conv_biases)
tf_normalize = tf.contrib.layers.batch_norm(inputs = normalize,is_training = A)
outputs_fed_lstm = tf_normalize
# conv layers 2-10 are 1x1 convolutions that only change the channel depth
# (64 -> 64 -> 64 -> 128 -> 128 -> 128 -> 256 -> 256 -> 256 -> 512), each followed by ELU and batch norm
filter_shape2 = [1, 1, 64, 64]
conv_weights2 = tf.get_variable("conv_weights2" , filter_shape2, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases2 = tf.Variable(tf.constant(0.1, shape=[64]))
conv2 = tf.nn.conv2d(outputs_fed_lstm, conv_weights2, strides=[1,1,1,1], padding = "VALID")
normalize2 = tf.nn.elu(conv2 + conv_biases2)
tf_normalize2 = tf.contrib.layers.batch_norm(inputs = normalize2,is_training = A)
outputs_fed_lstm2 = tf_normalize2
filter_shape3 = [1, 1, 64, 64]
conv_weights3 = tf.get_variable("conv_weights3" , filter_shape3, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases3 = tf.Variable(tf.constant(0.1, shape=[64]))
conv3 = tf.nn.conv2d(outputs_fed_lstm2, conv_weights3, strides=[1,1,1,1], padding = "VALID")
normalize3 = tf.nn.elu(conv3 + conv_biases3)
tf_normalize3 = tf.contrib.layers.batch_norm(inputs = normalize3,is_training = A)
outputs_fed_lstm3 = tf_normalize3
filter_shape4 = [1, 1, 64, 128]
conv_weights4 = tf.get_variable("conv_weights4" , filter_shape4, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases4 = tf.Variable(tf.constant(0.1, shape=[128]))
conv4 = tf.nn.conv2d(outputs_fed_lstm3, conv_weights4, strides=[1,1,1,1], padding = "VALID")
normalize4 = tf.nn.elu(conv4 + conv_biases4)
tf_normalize4 = tf.contrib.layers.batch_norm(inputs = normalize4,is_training = A)
outputs_fed_lstm4 = tf_normalize4
filter_shape5 = [1, 1, 128, 128]
conv_weights5 = tf.get_variable("conv_weights5" , filter_shape5, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases5 = tf.Variable(tf.constant(0.1, shape=[128]))
conv5 = tf.nn.conv2d(outputs_fed_lstm4, conv_weights5, strides=[1,1,1,1], padding = "VALID")
normalize5 = tf.nn.elu(conv5 + conv_biases5)
tf_normalize5 = tf.contrib.layers.batch_norm(inputs = normalize5,is_training = A)
outputs_fed_lstm5 = tf_normalize5
filter_shape6 = [1, 1, 128, 128]
conv_weights6 = tf.get_variable("conv_weights6" , filter_shape6, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases6 = tf.Variable(tf.constant(0.1, shape=[128]))
conv6 = tf.nn.conv2d(outputs_fed_lstm5, conv_weights6, strides=[1,1,1,1], padding = "VALID")
normalize6 = tf.nn.elu(conv6 + conv_biases6)
tf_normalize6 = tf.contrib.layers.batch_norm(inputs = normalize6,is_training = A)
outputs_fed_lstm6 = tf_normalize6
filter_shape7 = [1, 1, 128, 256]
conv_weights7 = tf.get_variable("conv_weights7" , filter_shape7, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases7 = tf.Variable(tf.constant(0.1, shape=[256]))
conv7 = tf.nn.conv2d(outputs_fed_lstm6, conv_weights7, strides=[1,1,1,1], padding = "VALID")
normalize7 = tf.nn.elu(conv7 + conv_biases7)
tf_normalize7 = tf.contrib.layers.batch_norm(inputs = normalize7,is_training = A)
outputs_fed_lstm7 = tf_normalize7
filter_shape8 = [1, 1, 256, 256]
conv_weights8 = tf.get_variable("conv_weights8" , filter_shape8, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases8 = tf.Variable(tf.constant(0.1, shape=[256]))
conv8 = tf.nn.conv2d(outputs_fed_lstm7, conv_weights8, strides=[1,1,1,1], padding = "VALID")
normalize8 = tf.nn.elu(conv8 + conv_biases8)
tf_normalize8 = tf.contrib.layers.batch_norm(inputs = normalize8,is_training = A)
outputs_fed_lstm8 = tf_normalize8
filter_shape9 = [1, 1, 256, 256]
conv_weights9 = tf.get_variable("conv_weights9" , filter_shape9, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases9 = tf.Variable(tf.constant(0.1, shape=[256]))
conv9 = tf.nn.conv2d(outputs_fed_lstm8, conv_weights9, strides=[1,1,1,1], padding = "VALID")
normalize9 = tf.nn.elu(conv9 + conv_biases9)
tf_normalize9 = tf.contrib.layers.batch_norm(inputs = normalize9,is_training = A)
outputs_fed_lstm9 = tf_normalize9
# conv layer 10 (note: its variables use the "0" suffix)
filter_shape0 = [1, 1, 256, 512]
conv_weights0 = tf.get_variable("conv_weights0" , filter_shape0, tf.float32,tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
conv_biases0 = tf.Variable(tf.constant(0.1, shape=[512]))
conv0 = tf.nn.conv2d(outputs_fed_lstm9, conv_weights0, strides=[1,1,1,1], padding = "VALID")
normalize0 = tf.nn.elu(conv0 + conv_biases0)
tf_normalize0 = tf.contrib.layers.batch_norm(inputs = normalize0,is_training = A)
outputs_fed_lstm0 = tf_normalize0
# dropout on the final conv features, then reshape into the list-of-steps format tf.nn.rnn expects
outputs_fed_lstm10 = tf.nn.dropout(x = outputs_fed_lstm0, keep_prob = B)
x = tf.squeeze(outputs_fed_lstm10, [2])   # drop the width-1 dim -> [batch, time_steps, 512]
x = tf.transpose(x, [1, 0, 2])            # -> [time_steps, batch, 512]
x = tf.reshape(x, [-1, 512])
x = tf.split(0, time_steps, x)            # list of time_steps tensors, each [batch, 512]
# single-layer LSTM over the conv features
lstm = tf.nn.rnn_cell.LSTMCell(num_units = _units, state_is_tuple=True)
# multi_lstm = tf.nn.rnn_cell.MultiRNNCell([lstm] * lstm_layers, state_is_tuple = True)
outputs , state = tf.nn.rnn(lstm,x, dtype = tf.float32)
# softmax classifier on the last time step's LSTM output
weights = tf.Variable(tf.random_normal([_units,num_classes]))
biases = tf.Variable(tf.random_normal([num_classes]))
logits = tf.matmul(outputs[-1], weights) + biases
c_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits,Y)
loss = tf.reduce_mean(c_loss)
# Adam with an exponentially decaying learning rate
global_step = tf.Variable(0, name="global_step", trainable=False)
decayed_learning_rate = tf.train.exponential_decay(learning_rate = 0.01,global_step = global_step,decay_steps = 300, decay_rate = 0.96, staircase = True)
optimizer= tf.train.AdamOptimizer(learning_rate = decayed_learning_rate)
#grads_and_vars = optimizer.compute_gradients(loss,[conv_weights0])
minimize_loss = optimizer.minimize(loss, global_step=global_step)
correct_predict = tf.nn.in_top_k(logits, Y, 1)
accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
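
As an aside, the ten near-identical blocks above can be written as one loop. The sketch below is illustrative only (conv_stack and its argument names are made up, not from the original post); it keeps the same TF 0.x API, initializer, and channel schedule as the code above:

# Minimal sketch (hypothetical helper, not the code that produced the results above):
# builds the same 10-layer conv stack in a loop.
def conv_stack(x, is_training, channels=(64, 64, 64, 128, 128, 128, 256, 256, 256, 512)):
    out = x                                     # [batch, time_steps, embedding, 1]
    in_channels = 1
    for i, out_channels in enumerate(channels):
        # layer 1 spans the embedding dimension; layers 2+ are 1x1 convs
        width = embedding if i == 0 else 1
        w = tf.get_variable("conv_weights_stack%d" % (i + 1),
                            [1, width, in_channels, out_channels], tf.float32,
                            tf.truncated_normal_initializer(mean=0.0, stddev=1.0))
        b = tf.Variable(tf.constant(0.1, shape=[out_channels]))
        out = tf.nn.conv2d(out, w, strides=[1, 1, 1, 1], padding="VALID")
        out = tf.contrib.layers.batch_norm(inputs=tf.nn.elu(out + b), is_training=is_training)
        in_channels = out_channels
    return out                                  # [batch, time_steps, 1, channels[-1]]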
Answer (score: 1):
I have figured out why this happens. When the layer widths were chosen arbitrarily, say 56, then 86, then 496 filters, the problem occurred no matter how many layers I added: huge loss and very low accuracy. The fix was to make the filter counts follow a regular pattern like 64, 128, 256, 512.
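
If that observation holds, the hypothetical conv_stack helper sketched above makes the width schedule explicit; for example, a four-layer stack on the doubling pattern:

features = conv_stack(tf.expand_dims(X, 3), A, channels=(64, 128, 256, 512))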