我用以下代码构建了一个支持稀疏特征向量的自定义LR模型:
def custom_model_fn(features, labels, mode, params):
    """Model function for a sparse-feature logistic-regression Estimator.

    Args:
        features: dict; key 'featureID' holds a tf.SparseTensor of feature
            ids for the current batch.
        labels: binary label tensor (cast to float32 for the loss).
        mode: one of tf.estimator.ModeKeys (TRAIN / EVAL / PREDICT).
        params: dict with 'feature_size' (vocabulary size of the weight
            vector) and 'negative_sampling_rate' (used to de-bias logits
            at eval/serving time).

    Returns:
        A tf.estimator.EstimatorSpec appropriate for `mode`.
    """
    # NOTE(review): this function reads self.partitioner and self.optimizer,
    # so it must actually be a bound method (or close over `self`) — confirm
    # against the caller; as a plain 4-arg model_fn these are NameErrors.
    linear_bias = tf.get_variable(
        name='linear_bias',
        shape=[1],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.0001))
    # The weight vector must be created with the SAME partitioner in every
    # mode and on every worker; a checkpoint written by a partitioned
    # training job cannot be restored into an unpartitioned eval graph
    # (symptom: weights look random, AUC ~ 0.5).
    linear_w = tf.get_variable(
        name='linear_w',
        shape=[params['feature_size'], 1],
        dtype=tf.float32,
        initializer=tf.random_normal_initializer(stddev=0.0001),
        partitioner=self.partitioner)
    # w·x per example: sum the rows of linear_w selected by the sparse ids.
    # Shape: [batch_size, 1].
    logits_wide = tf.nn.embedding_lookup_sparse(
        params=linear_w,
        sp_ids=features['featureID'],
        sp_weights=None,
        combiner='sum')
    # w·x + b
    logits = linear_bias + logits_wide
    # Correct for negative downsampling of the training data:
    # calibrated probability = sigmoid(logit + log(sampling_rate)).
    logits_adjusted = logits + tf.math.log(params['negative_sampling_rate'])

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': tf.nn.sigmoid(logits_adjusted),
            'logits': logits,
            'logits_adjusted': logits_adjusted
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Loss uses the raw (unadjusted) logits because the training batches
    # themselves are negatively sampled.
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(labels, dtype=tf.float32),
            logits=logits))

    if mode == tf.estimator.ModeKeys.EVAL:
        auc = tf.metrics.auc(
            labels=labels,
            # Same value as the original 1 / (1 + exp(-x)), written with the
            # numerically-stable library op for consistency with PREDICT.
            predictions=tf.nn.sigmoid(logits_adjusted),
            num_thresholds=400,
            curve='ROC',
            summation_method='careful_interpolation')
        logloss = tf.metrics.mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.cast(labels, dtype=tf.float32),
                logits=logits_adjusted))
        # BUG FIX: tf.metrics.* returns a (value, update_op) tuple, and
        # tf.summary.scalar requires a scalar tensor — passing the tuple
        # raises. Log the update op so the summary reflects the running value.
        tf.summary.scalar('True_AUC', auc[1])
        tf.summary.scalar('True_Logloss', logloss[1])
        metrics = {
            'True_AUC': auc,
            'True_Logloss': logloss
        }
        predictions = {
            'probabilities': tf.nn.sigmoid(logits_adjusted),
            'logits': logits,
            'logits_adjusted': logits_adjusted
        }
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, predictions=predictions,
            eval_metric_ops=metrics)

    # TRAIN: standard minimize step, advancing the global step so
    # checkpointing and hooks see training progress.
    train_op = self.optimizer.minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
我使用的分区程序是tf.fixed_size_partitioner,以ps数作为参数。当我仅使用一个PS运行此代码时,我得到auc = 0.87,这是正确的。但是,当我使用多个ps(ps_num> 1)时,我总是得到auc = 0.5。我检查了该图,该分区程序已成功在PS之间分配linear_w。而且global_step上升到30,000+,这也表明优化程序正在运行。我在分布式TF中错过了任何导致此问题的内容吗?
答案 0（得分：0）
您需要在训练和评估中保持代码相同。这意味着即使在单个 worker 上评估，也需要使用同样的分区方式创建变量，例如 tf.get_variable(xxx, partitioner=tf.fixed_size_partitioner(6))
，其中数字 6 取决于您在训练时使用了多少个分区（即 ps 数）。否则训练时保存的分区变量无法正确恢复到评估图中，导致 AUC 退化为 0.5。