我正在尝试使用Tensorflow中的PoolNet从Spotlight重新创建BPR loss,但我无法获得相同的结果。下面是我正在使用的模型(它是估算器model_fn)。
def _pooling_model_fn(features, labels, mode, params):
with tf.name_scope('inputs'):
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
users_prev_items_inputs_train = features['item_seqs']
elif mode == tf.estimator.ModeKeys.PREDICT:
users_prev_items_inputs_train = tf.reshape(features['item_seqs'], [1, -1])
with tf.device('/cpu:0'):
prod_embeddings = tf.keras.layers.Embedding(params["num_items"], params["item_emb_size"], mask_zero=True)
item_biases = tf.keras.layers.Embedding(params["num_items"], 1, mask_zero=True, embeddings_initializer=tf.keras.initializers.Zeros())
prod_embed = prod_embeddings(users_prev_items_inputs_train)
targets = tf.transpose(prod_embed, [0, 2, 1])
sequence_embeddings = tf.expand_dims(targets, axis=3)
sequence_embeddings = tf.pad(sequence_embeddings, paddings=tf.constant([[0, 0], [0, 0], [1, 0], [0, 0]]))
sequence_embedding_sum = tf.cumsum(sequence_embeddings, 2)
non_padding_entries = tf.cumsum(tf.cast(tf.not_equal(sequence_embeddings, tf.constant(0.0)), tf.float32), 2) # .expand_as(sequence_embedding_sum)
user_representations = tf.squeeze((sequence_embedding_sum / (non_padding_entries + 1)), [3])
user_representations_so_far = user_representations[:, :, :-1]
user_representations_new = user_representations[:, :, -1]
if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
global_step = tf.contrib.framework.get_or_create_global_step()
with tf.name_scope('loss'):
negative_samples = features['neg_samp']
with tf.device('/cpu:0'):
prod_embed_pos = prod_embeddings(users_prev_items_inputs_train)
target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
prod_bias_pos = item_biases(users_prev_items_inputs_train)
target_bias_positive = tf.squeeze(prod_bias_pos)
dot_positive = tf.reduce_sum(user_representations_so_far * target_embedding_positive, 1) + target_bias_positive
with tf.device('/cpu:0'):
prod_embed_neg = prod_embeddings(negative_samples)
target_embedding_negative = tf.squeeze(tf.transpose(prod_embed_neg, [0, 2, 1]))
prod_bias_neg = item_biases(negative_samples)
target_bias_negative = tf.squeeze(prod_bias_neg)
dot_negative = tf.reduce_sum(user_representations_so_far * target_embedding_negative, 1) + target_bias_negative
mask = tf.not_equal(users_prev_items_inputs_train, 0)
loss = bpr_loss(dot_positive, dot_negative, mask)
if mode == tf.estimator.ModeKeys.TRAIN:
with tf.name_scope('optimizer'):
optimizer = tf.train.AdamOptimizer(learning_rate=params["lr"])
train_op = optimizer.minimize(loss, global_step=global_step)
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
if mode == tf.estimator.ModeKeys.PREDICT:
item_ids = np.arange(params['num_items']).reshape(-1, 1)
item_ids_tensor = tf.convert_to_tensor(item_ids, dtype=tf.int64)
with tf.device('/cpu:0'):
prod_embed_pos = prod_embeddings(item_ids_tensor) # tf.nn.embedding_lookup(prod_embeddings, item_ids_tensor)
target_embedding_positive = tf.squeeze(tf.transpose(prod_embed_pos, [0, 2, 1]))
prod_bias_pos = item_biases(item_ids_tensor) # tf.nn.embedding_lookup(item_biases, item_ids_tensor)
target_bias_positive = tf.squeeze(prod_bias_pos)
dot_positive = tf.reduce_sum(user_representations_new * target_embedding_positive, 1) + target_bias_positive
predictions = {
'products': tf.reshape(dot_positive, [1, -1])
}
export_outputs = {
'prediction': tf.estimator.export.PredictOutput(predictions)
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)
和损失函数
def bpr_loss(positive_predictions, negative_predictions, mask):
loss1 = 1.0 - tf.nn.sigmoid(positive_predictions - negative_predictions)
if mask is not None:
mask = tf.cast(mask, loss1.dtype)
final_loss = loss1 * mask
return tf.reduce_sum(final_loss) / tf.reduce_sum(mask)
return tf.reduce_mean(loss1)
使用上述模型,我无法对与Spotlight完全相同的数据集(和相同的随机种子)进行相同的预测。我最终认为问题在于零填充。生成数据的方式如下:
[[0,0,0,5,6,98],
[0,62,15,4,8,47],
[0,0,5,9,6,3,41],
[78,21,2,56,1,3]]
它们具有前导零填充,因此每个输入样本具有相同的长度。
基于我的代码,我相信我做了一切来掩盖丢失,嵌入层(使用Keras的mask_zero参数)以及我正在计算的嵌入的平均值(使用cumsum)中的这些零。 )。尽管如此,在训练之后,零索引嵌入也在不断变化(意味着不考虑排除,而是导致影响其余梯度并为我的结果增加噪声)。
Pytorch在Embedding layer的实现中似乎有一个很好的功能,你可以在其中设置padding_idx
的pad的id,这将用零初始化。此外,它使该索引的梯度始终为零。所以基本上,我试图用Tensorflow做同样的事情。
任何帮助都将不胜感激。
答案 0 :(得分:1)
我使用Tensorflow的Github上发布的以下solution解决了这个问题。它似乎现在有效。
mask_padding_zero_op = tf.scatter_update(lookup_table,
PADDING_ID,
tf.zeros([EMBEDDING_DIM,], dtype=DTYPE))
with tf.control_dependencies([mask_padding_zero_op]):
# do embedding lookup...