我正在尝试用下面的代码实现一个自定义的 TensorFlow Estimator,但在训练过程中遇到了 "NaN loss during training" 错误。
堆栈追踪:
Traceback (most recent call last):
File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/mnt/release/scripts/gcloudml/projections_learning/trainer-x.x.x/trainer/task.py", line 212, in <module>
learn_runner.run(generate_experiment_fn(**arguments), job_dir)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 209, in run
return _execute_schedule(experiment, schedule)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 46, in _execute_schedule
return task()
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 502, in train_and_evaluate
self.train(delay_secs=0)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 280, in train
hooks=self._train_monitors + extra_hooks)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 677, in _call_train
monitors=hooks)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/deprecation.py", line 296, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 458, in fit
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.py", line 1010, in _train_model
_, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 518, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 862, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 818, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 980, in run
run_metadata=run_metadata))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 551, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
我参照了 https://github.com/GoogleCloudPlatform/cloudml-samples/blob/master/census/estimator/trainer/model.py 中的示例代码,并将其修改为使用自定义 Estimator 来代替 DNNLinearCombinedClassifier。
如果我在这里遗漏了什么,请告诉我。
P.S.:在同样的数据上使用 DNNLinearCombinedClassifier 可以正常训练,没有任何问题。
模型函数(model_fn)的代码如下:
def build_estimator(self, model_dir, hidden_units):
    """Build a ``tf.contrib.learn.Estimator`` wrapping a custom regression DNN.

    Args:
        model_dir: Directory handed to the estimator as ``model_dir``.
        hidden_units: Iterable of layer widths; one ReLU dense layer is
            created per entry. An empty iterable yields a purely linear model.

    Returns:
        A ``tf.contrib.learn.Estimator`` configured with ``model_fn`` below.
    """

    def model_fn(features, labels, mode, params):
        """Construct the graph: dense ReLU stack followed by one linear unit.

        Args:
            features: Feature tensors consumed by ``params['feature_columns']``.
            labels: Regression targets, or ``None`` when no labels are
                supplied (e.g. at inference time).
            mode: Mode key forwarded unchanged to ``ModelFnOps``.
            params: Dict with ``feature_columns``, ``hidden_layers`` and
                ``learning_rate`` entries.

        Returns:
            A ``ModelFnOps`` holding predictions and, when labels are
            available, the loss, train op and RMSE eval metric.
        """
        net = tf.feature_column.input_layer(
            features=features,
            feature_columns=params['feature_columns'])
        # Iterating over every entry (rather than special-casing index 0)
        # keeps an empty hidden-layer list from raising IndexError.
        for num_hidden_units in params['hidden_layers']:
            net = tf.layers.dense(net, num_hidden_units,
                                  activation=tf.nn.relu)
        output_layer = tf.layers.dense(net, 1)
        # Reshape output layer to 1-dim Tensor to return predictions
        predictions = tf.reshape(output_layer, [-1])

        loss = None
        train_op = None
        eval_metric_ops = {}
        # Loss, metrics and the train op all need labels; skip them when the
        # estimator calls us without labels instead of failing on None.
        if labels is not None:
            # Flatten and cast once so the loss and the metric both compare
            # tensors of identical shape and dtype. Previously the loss used
            # the un-flattened [batch, 1] output while the metric used the
            # flattened predictions.
            labels = tf.reshape(tf.cast(labels, tf.float32), [-1])
            # Calculate loss using mean squared error
            loss = tf.losses.mean_squared_error(labels, predictions)
            # Calculate root mean squared error as additional eval metric
            eval_metric_ops = {
                "rmse": tf.metrics.root_mean_squared_error(
                    labels, predictions)
            }
            # NOTE(review): plain gradient descent on an unbounded MSE can
            # diverge if features/labels are large or unscaled -- confirm the
            # input scaling or lower the learning rate if the loss blows up.
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=params["learning_rate"])
            train_op = optimizer.minimize(
                loss=loss, global_step=tf.train.get_global_step())

        # eval_metric_ops is now forwarded; it used to be computed and dropped.
        return ModelFnOps(mode=mode,
                          loss=loss,
                          train_op=train_op,
                          predictions=predictions,
                          eval_metric_ops=eval_metric_ops)

    # NOTE(review): `feature_columns` is not defined in this function; it is
    # presumably provided by an enclosing/module scope -- confirm.
    model_params = {"learning_rate": 0.01,
                    "hidden_layers": hidden_units,
                    "feature_columns": feature_columns}
    return tf.contrib.learn.Estimator(model_fn=model_fn,
                                      params=model_params,
                                      model_dir=model_dir)