Why do the RNN hidden states become almost all -1 or 1 during training?

Asked: 2018-06-20 03:19:52

Tags: python tensorflow rnn

I am working on a prediction task: predict which shop a customer will click next, given that customer's history of clicked shops. I model the data with an RNN and pad the sequences to the maximum length in each batch. But when I run the program, the accuracy is 0.

To investigate, I printed the RNN's hidden states and found that at the beginning they look normal, like this:

0.032015    -0.035735   -0.029594   0.005365    -0.071770   -0.025201   -0.004161   0.020350    0.039628    -0.007609   0.024006    -0.025892   -0.018199   -0.031058   0.030478    -0.021961   0.029184    0.029706    -0.029774   -0.006685

But they gradually become 1 and -1, like:

    1.000000    -0.682048   -1.000000   -1.000000   -1.000000   0.999937    -1.000000   1.000000    1.000000    1.000000    0.787266    -1.000000   -0.747823   -1.000000   -1.000000   -1.000000   0.999986    0.884828    -0.999983   1.000000
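As I understand it, BasicRNNCell uses tanh as its activation, so hidden states pinned at ±1 should mean the pre-activations have grown very large in magnitude. A tiny numpy check of how quickly tanh saturates (illustrative only):

    import numpy as np

    # once the pre-activation exceeds roughly 3 in magnitude, tanh is already
    # very close to +/-1, which matches the saturated hidden states above
    pre_activations = np.array([0.1, 1.0, 3.0, 6.0, -6.0])
    print(np.tanh(pre_activations))
    # [ 0.09966799  0.76159416  0.99505475  0.99998771 -0.99998771]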

Specifically, there are about 2 million shops and about 20 million sequences in the dataset, and I filter out sequences shorter than 5 or longer than 30. The average sequence length is about 10. For example, if a sequence is 3 5 7 10 22 58 6, then the input for this sequence is 3 5 7 10 22 58 and the corresponding labels are 5 7 10 22 58 6.
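A minimal sketch of this preprocessing (the function and variable names here are only for illustration, not my actual code):

    # keep sequences whose length is in [5, 30], then shift by one step
    # so that the label at each position is the next clicked shop
    def make_examples(click_seqs, min_len=5, max_len=30):
        examples = []
        for seq in click_seqs:
            if len(seq) < min_len or len(seq) > max_len:
                continue
            examples.append((seq[:-1], seq[1:]))  # (input, label)
        return examples

    print(make_examples([[3, 5, 7, 10, 22, 58, 6]]))
    # [([3, 5, 7, 10, 22, 58], [5, 7, 10, 22, 58, 6])]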

To save time, I use sampled_softmax_loss during training. At prediction time, I use the shop IDs that appear in the current batch as the target classes. I know this is problematic because some shops appear multiple times, but I don't know a better way, and I don't think it is the main issue: the hidden-state problem makes training fail, so even with a perfect method (full softmax?) the prediction accuracy would still be poor.
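For reference, this is roughly what I mean by full softmax at prediction time (just a sketch using the variables defined in the model below; with about 2 million shops it is too expensive, which is why I avoid it):

    # logits over all shops: shape [batch_size * n_steps, class_num]
    full_logits = tf.matmul(output_reshaped, softmax_W, transpose_b=True) + softmax_b
    full_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.reshape(Y, [-1]), logits=full_logits)
    full_loss = full_loss * tf.cast(seqlen_mask_reshaped, tf.float32)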

Now I don't know why the hidden states become close to 1 and -1, or how to fix this so that the accuracy becomes acceptable.

Does anyone have suggestions?

There may also be other problems with my program or my approach to this task; if so, please point them out.

Any advice would be greatly appreciated!

Thanks in advance!

Below is my simplified model definition; I have omitted the training and prediction parts.

import numpy as np
import tensorflow as tf

n_steps = 3
n_inputs = 3
n_neurons = 5
class_num = 20 
batch_size = 4
max_length= 30

tf.reset_default_graph()
X = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
Y = tf.placeholder(tf.int32, [None, n_steps], name='output')
seq_length = tf.placeholder(tf.int32, [None])

initializer = tf.random_normal_initializer(0.0, 0.1)
embedding = tf.get_variable('embedding', [class_num , n_inputs], initializer=initializer)
softmax_W = tf.get_variable('softmax_w', [class_num , n_neurons], initializer=initializer)
softmax_b = tf.get_variable('softmax_b', [class_num], initializer=initializer)

inputs = tf.nn.embedding_lookup(embedding, X)

# use a separate cell instance per layer so the two layers do not share weights
cells = [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons) for _ in range(2)]

stacked_cell = tf.contrib.rnn.MultiRNNCell(cells)

init_state = stacked_cell.zero_state(batch_size, tf.float32)

outputs, states = tf.nn.dynamic_rnn(stacked_cell, inputs, sequence_length=seq_length,
                                    initial_state=init_state, dtype=tf.float32)
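# outputs has shape [batch_size, max(seq_length), n_neurons]; because sequence_length
# is passed to dynamic_rnn, positions beyond each sequence's length are left as zeros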

output_reshaped = tf.reshape(outputs, [ -1, n_neurons ] )

#generate sequence mask to omit the loss of the padded position of the sequence.
lower_triangular_ones = tf.constant(np.tril(np.ones([max_length])),dtype=tf.int32)
seqlen_mask = tf.slice(tf.gather(lower_triangular_ones, seq_length - 1),[0, 0], [batch_size, tf.reduce_max(seq_length)])
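# e.g. with seq_length = [3, 2, 2, 3], tf.gather picks rows 2, 1, 1, 2 of the lower-
# triangular matrix (row i has i+1 leading ones) and the slice keeps the first
# max(seq_length) = 3 columns, giving the 0/1 mask [[1,1,1],[1,1,0],[1,1,0],[1,1,1]]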

seqlen_mask_reshaped = tf.reshape( seqlen_mask , [ -1] )

Y_reshaped = tf.reshape( Y, [-1,1 ] )

# 'task' was not defined in this simplified snippet; set it so the toy check below runs
task = 'evaluate'

if task == 'train':
    cost_with_mask_sample = tf.nn.sampled_softmax_loss(weights=softmax_W, 
                                                     biases=softmax_b, 
                                                     labels=Y_reshaped, 
                                                     inputs=output_reshaped ,
                                                     num_sampled=5,
                                                     num_classes=20, partition_strategy="div")

    pure_cost = cost_with_mask_sample * tf.cast(seqlen_mask_reshaped, tf.float32)

    cost = tf.reduce_sum( pure_cost ) / tf.reduce_sum( tf.cast(seqlen_mask_reshaped, tf.float32) )

elif task == 'evaluate':
#use shop occurred in the batch as target class as the full softmax is infeasible

    sampled_W = tf.nn.embedding_lookup(softmax_W, Y)
    sampled_b = tf.nn.embedding_lookup(softmax_b, Y)

    tmp_logits = tf.matmul(output_reshaped, tf.reshape(sampled_W, [-1, n_neurons]), transpose_b=True)
    logits = tmp_logits  + tf.reshape(sampled_b, [-1])
    preds = tf.nn.softmax(logits)
    diag_preds = tf.diag_part(preds)
    unmasked_loss = -tf.log(diag_preds+1e-24)
    unmasked_loss_reshaped = tf.reshape( unmasked_loss , [-1] )
    masked_loss = unmasked_loss_reshaped * tf.cast(seqlen_mask_reshaped, tf.float32)
    predict_cost = tf.reduce_sum(masked_loss) / tf.reduce_sum( tf.cast(seqlen_mask_reshaped, tf.float32) )
    result = preds > tf.reshape(diag_preds, [-1,1])
    padded_ranks = tf.reduce_sum( tf.cast( result, tf.int32) , axis=1 )  + 1
    # rank of the true shop among the in-batch candidates; rank 1 means a correct prediction
    rank_ok = tf.cast(padded_ranks < 20, tf.int32) * seqlen_mask_reshaped
    # '==' on a Tensor is not element-wise in TF 1.x, so (padded_ranks == 1) always
    # evaluated to False; tf.equal gives the intended element-wise comparison
    acc_ok = tf.cast(tf.equal(padded_ranks, 1), tf.int32) * seqlen_mask_reshaped

# sanity-check the graph with a small toy batch
X_batch = np.array([
  # t = 0      t = 1     t=2
  [2, 5, 8], # instance 0
  [3, 2, 0], # instance 1
  [7, 12,0], #instance 2
  [17,15,11], #instance 3
])

Y_batch = np.array([
     # t = 0      t = 1     t=2
  [5,8,10], # instance 0
  [2,0, 0], # instance 1
  [12, 0, 0 ], #instance 2
  [15, 11,0 ], #instance 3 

])
seq_length_batch = np.array([3, 2, 2, 3])

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())

  fetches = [outputs, softmax_W, sampled_W, sampled_b, tmp_logits, logits, preds,
             diag_preds, unmasked_loss, masked_loss, predict_cost, result,
             padded_ranks, rank_ok, acc_ok]
  (outputs_val, softmax_W_val, sampled_W_val, sampled_b_val, tmp_logits_val,
   logits_val, preds_val, diag_preds_val, unmasked_loss_val, masked_loss_val,
   predict_cost_val, result_val, padded_ranks_val, rank_ok_val, acc_ok_val) = sess.run(
      fetches, feed_dict={X: X_batch, Y: Y_batch, seq_length: seq_length_batch})


  print('outputs:')
  print(outputs_val)
  print('softmax_W_val')
  print(softmax_W_val)
  print('\nsampled_W_val:')
  print(sampled_W_val)

  print('sampled_b_val')
  print( sampled_b_val)
  print('tmp_logits_val')
  print(tmp_logits_val)
 # print(states_val[1][0])
  print('logits_val:')
  print(logits_val)
  print('preds_val')
  print(preds_val)
  print('diag_preds_val')
  print(diag_preds_val)
  print('unmasked_loss_val')
  print(unmasked_loss_val)
  print('masked_loss_val')
  print(masked_loss_val)
  print('predict_cost_val')
  print(predict_cost_val)
  print('result_val')
  print(result_val)
  print('padded_ranks_val')
  print(padded_ranks_val)
  print('rank_ok_val')
  print(rank_ok_val)
  print('acc_ok_val')
  print(acc_ok_val)

0 Answers:

No answers yet.