I am working on a prediction task: given a customer's historical click sequence, predict the shop that customer will click next. I model the data with an RNN and pad the sequences to the maximum length within each batch. But when I run the program, the accuracy is 0. To look for the cause, I printed the RNN's hidden state and found that at the beginning it looks normal, like this:
0.032015 -0.035735 -0.029594 0.005365 -0.071770 -0.025201 -0.004161 0.020350 0.039628 -0.007609 0.024006 -0.025892 -0.018199 -0.031058 0.030478 -0.021961 0.029184 0.029706 -0.029774 -0.006685
but it gradually saturates to 1 and -1, like:
1.000000 -0.682048 -1.000000 -1.000000 -1.000000 0.999937 -1.000000 1.000000 1.000000 1.000000 0.787266 -1.000000 -0.747823 -1.000000 -1.000000 -1.000000 0.999986 0.884828 -0.999983 1.000000
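For reference, BasicRNNCell applies a tanh activation, so values pinned at 1.000000 just mean the pre-activations have grown large; a quick numpy check (mine, for illustration only) shows the saturation:

import numpy as np
print(np.tanh(0.05))  # ~0.04996, the "normal" regime seen at the start
print(np.tanh(8.0))   # ~0.9999998, which prints as 1.000000 at six decimals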
Specifically, there are about 2 million shops in the dataset and about 20 million sequences; I filter out sequences shorter than 5 or longer than 30, and the average sequence length is about 10. For example, if a sequence is 3 5 7 10 22 58 6, then the input for that sequence is 3 5 7 10 22 58 and the corresponding labels are 5 7 10 22 58 6.
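In code, this input/label construction is simply the following (the helper name make_example is only illustrative):

def make_example(seq):
    # next-item prediction: the labels are the inputs shifted left by one
    return seq[:-1], seq[1:]

x, y = make_example([3, 5, 7, 10, 22, 58, 6])
# x -> [3, 5, 7, 10, 22, 58]; y -> [5, 7, 10, 22, 58, 6]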
To save time, I use sampled_softmax_loss during training. For prediction, I use the shop IDs that occur in the current batch as the target classes. I know this is problematic, because some shops occur more than once, but I don't know of a better method, and I don't think it is the main problem: the hidden-state issue makes training fail, so even with a perfect prediction method (a full softmax?) the accuracy would still be poor.
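In plain numpy terms, this batch-wise evaluation works like the following sketch (the array names are only illustrative): each position's hidden state is scored against the softmax rows of every label in the batch, and the true label's score sits on the diagonal:

import numpy as np
hidden = np.random.randn(4, 5)   # [positions, n_neurons]
batch_W = np.random.randn(4, 5)  # softmax rows of the in-batch labels
scores = hidden @ batch_W.T      # [positions, candidates]; truth on the diagonal
ranks = (scores > np.diag(scores)[:, None]).sum(axis=1) + 1  # rank 1 = best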
Now I do not understand why the hidden state saturates to values near 1 and -1, or how to fix this so that the accuracy becomes acceptable. Does anyone have suggestions? There may also be other problems with my program or with my approach to this task; if so, please tell me. Any advice would be appreciated. Thanks in advance!
Below is my simplified model definition; I have omitted the training and preprocessing parts.
import numpy as np
import tensorflow as tf

# This simplified snippet exercises only the evaluation branch; in the full
# program `task` is set elsewhere.
task = 'evaluate'
n_steps = 3      # time steps in the toy batch
n_inputs = 3     # embedding dimension
n_neurons = 5    # RNN hidden size
class_num = 20   # number of shop classes in the toy example
batch_size = 4
max_length = 30  # longest sequence kept after filtering
tf.reset_default_graph()
X = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
Y = tf.placeholder(tf.int32, [None, n_steps], name='output')
seq_length = tf.placeholder(tf.int32, [None])
initializer = tf.random_normal_initializer(0.0, 0.1)
embedding = tf.get_variable('embedding', [class_num, n_inputs], initializer=initializer)
softmax_W = tf.get_variable('softmax_w', [class_num, n_neurons], initializer=initializer)
softmax_b = tf.get_variable('softmax_b', [class_num], initializer=initializer)
inputs = tf.nn.embedding_lookup(embedding, X)
# Build one cell per layer: `[basic_cell] * 2` reuses a single cell object,
# which makes both layers share weights (and raises an error in newer
# TF 1.x releases).
stacked_cell = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.BasicRNNCell(num_units=n_neurons) for _ in range(2)])
init_state = stacked_cell.zero_state(batch_size, tf.float32)
outputs, states = tf.nn.dynamic_rnn(stacked_cell, inputs, sequence_length=seq_length,
                                    initial_state=init_state, dtype=tf.float32)
output_reshaped = tf.reshape(outputs, [-1, n_neurons])
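# outputs: [batch_size, max_time, n_neurons], zeroed past each sequence's
# length; output_reshaped flattens it to [batch_size * max_time, n_neurons].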
# Generate a sequence mask to omit the loss at the padded positions.
lower_triangular_ones = tf.constant(np.tril(np.ones([max_length])), dtype=tf.int32)
seqlen_mask = tf.slice(tf.gather(lower_triangular_ones, seq_length - 1),
                       [0, 0], [batch_size, tf.reduce_max(seq_length)])
seqlen_mask_reshaped = tf.reshape(seqlen_mask, [-1])
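# For seq_length = [3, 2, 2, 3] and three steps, the mask rows are
# [1,1,1], [1,1,0], [1,1,0], [1,1,1]; tf.sequence_mask(seq_length) plus a
# cast would build the same thing.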
Y_reshaped = tf.reshape(Y, [-1, 1])
if task == 'train':
    cost_with_mask_sample = tf.nn.sampled_softmax_loss(weights=softmax_W,
                                                       biases=softmax_b,
                                                       labels=Y_reshaped,
                                                       inputs=output_reshaped,
                                                       num_sampled=5,
                                                       num_classes=class_num,
                                                       partition_strategy="div")
    pure_cost = cost_with_mask_sample * tf.cast(seqlen_mask_reshaped, tf.float32)
    cost = tf.reduce_sum(pure_cost) / tf.reduce_sum(tf.cast(seqlen_mask_reshaped, tf.float32))
elif task == 'evaluate':
    # Use the shops occurring in the batch as target classes, since a full
    # softmax over ~2 million shops is infeasible.
    sampled_W = tf.nn.embedding_lookup(softmax_W, Y)
    sampled_b = tf.nn.embedding_lookup(softmax_b, Y)
    tmp_logits = tf.matmul(output_reshaped, tf.reshape(sampled_W, [-1, n_neurons]),
                           transpose_b=True)
    logits = tmp_logits + tf.reshape(sampled_b, [-1])
    preds = tf.nn.softmax(logits)
    # The true label's probability sits on the diagonal.
    diag_preds = tf.diag_part(preds)
    unmasked_loss = -tf.log(diag_preds + 1e-24)
    unmasked_loss_reshaped = tf.reshape(unmasked_loss, [-1])
    masked_loss = unmasked_loss_reshaped * tf.cast(seqlen_mask_reshaped, tf.float32)
    predict_cost = tf.reduce_sum(masked_loss) / tf.reduce_sum(tf.cast(seqlen_mask_reshaped, tf.float32))
    # Rank of the true label among the in-batch candidates (1 = best).
    result = preds > tf.reshape(diag_preds, [-1, 1])
    padded_ranks = tf.reduce_sum(tf.cast(result, tf.int32), axis=1) + 1
    rank_ok = tf.cast(padded_ranks < 20, tf.int32) * seqlen_mask_reshaped
    # `padded_ranks == 1` is plain Python equality on a Tensor in TF 1.x and
    # always evaluates to False, which alone forces the reported accuracy
    # to 0; tf.equal compares elementwise.
    acc_ok = tf.cast(tf.equal(padded_ranks, 1), tf.int32) * seqlen_mask_reshaped
# Just to check the program.
X_batch = np.array([
    # t = 0  t = 1  t = 2
    [2,  5,  8],    # instance 0
    [3,  2,  0],    # instance 1
    [7,  12, 0],    # instance 2
    [17, 15, 11],   # instance 3
])
Y_batch = np.array([
    [5,  8,  10],   # instance 0
    [2,  0,  0],    # instance 1
    [12, 0,  0],    # instance 2
    [15, 11, 0],    # instance 3
])
seq_length_batch = np.array([3, 2, 2, 3])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    (outputs_val, softmax_W_val, sampled_W_val, sampled_b_val, tmp_logits_val,
     logits_val, preds_val, diag_preds_val, unmasked_loss_val, masked_loss_val,
     predict_cost_val, result_val, padded_ranks_val, rank_ok_val,
     acc_ok_val) = sess.run(
        [outputs, softmax_W, sampled_W, sampled_b, tmp_logits, logits, preds,
         diag_preds, unmasked_loss, masked_loss, predict_cost, result,
         padded_ranks, rank_ok, acc_ok],
        feed_dict={X: X_batch, Y: Y_batch, seq_length: seq_length_batch})

    print('outputs:')
    print(outputs_val)
    print('softmax_W_val:')
    print(softmax_W_val)
    print('\nsampled_W_val:')
    print(sampled_W_val)
    print('sampled_b_val:')
    print(sampled_b_val)
    print('tmp_logits_val:')
    print(tmp_logits_val)
    # print(states_val[1][0])
    print('logits_val:')
    print(logits_val)
    print('preds_val:')
    print(preds_val)
    print('diag_preds_val:')
    print(diag_preds_val)
    print('unmasked_loss_val:')
    print(unmasked_loss_val)
    print('masked_loss_val:')
    print(masked_loss_val)
    print('predict_cost_val:')
    print(predict_cost_val)
    print('result_val:')
    print(result_val)
    print('padded_ranks_val:')
    print(padded_ranks_val)
    print('rank_ok_val:')
    print(rank_ok_val)
    print('acc_ok_val:')
    print(acc_ok_val)