我正在使用配置为 CNN 的 TensorFlow Estimator,每次运行代码时都会出现以下错误:
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "cnn_training_v3.py", line 108, in <module>
classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 363, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 859, in _train_model_default
saving_listeners)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1059, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 567, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1043, in run
run_metadata=run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1134, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1119, in run
return self._sess.run(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1199, in run
run_metadata=run_metadata))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 623, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
我知道这个网站上已经有过类似的问题,但那些答案对我没有帮助。我已经尝试过降低学习率、给 logits 加上 epsilon 以及更换损失函数,但仍然出错。
这是我的 CNN 模型函数:
# CNN function
def cnn_model_fn(features, labels, mode):
    """Model function for a tf.estimator.Estimator: a small CNN with 4 output classes.

    Args:
        features: dict whose "images" entry is reshaped to [-1, 200, 200, 3].
        labels: class ids fed to the sparse softmax cross-entropy loss and to
            the accuracy metric; not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value selecting which spec to build.

    Returns:
        A tf.estimator.EstimatorSpec: predictions only for PREDICT, loss plus
        train_op for TRAIN, loss plus eval metrics otherwise.
    """
    mode_keys = tf.estimator.ModeKeys

    # Network body: two (5x5 conv + relu, 2x2 max-pool) stages, then a dense head.
    # Layers are created in the same order as before so that the automatically
    # generated variable names do not change.
    images = tf.reshape(features["images"], [-1, 200, 200, 3])
    conv1 = tf.layers.conv2d(
        inputs=images, filters=32, kernel_size=[5, 5],
        padding="same", activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    conv2 = tf.layers.conv2d(
        inputs=pool1, filters=64, kernel_size=[5, 5],
        padding="same", activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # Two stride-2 pools shrink 200x200 to 50x50, with 64 channels from conv2.
    flattened = tf.reshape(pool2, [-1, 50 * 50 * 64])
    hidden = tf.layers.dense(inputs=flattened, units=1024, activation=tf.nn.relu)
    logits = tf.layers.dense(inputs=hidden, units=4)

    # Shift every logit by the same small constant.
    logits = logits + tf.constant(value=0.00001, shape=(1, 4))

    # Predicted class index, plus a named softmax op that can be looked up
    # by the name "softmax_tensor" (e.g. from a logging hook).
    predicted_class = tf.argmax(input=logits, axis=1)
    tf.nn.softmax(logits, name="softmax_tensor")
    predictions = {"blocknum_classes": predicted_class}

    # PREDICT needs neither labels nor a loss.
    if mode == mode_keys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # The loss is shared by TRAIN and EVAL.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == mode_keys.TRAIN:
        sgd = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
        train_op = sgd.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Any remaining mode is treated as evaluation.
    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=predictions["blocknum_classes"])
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops={"blocknum_accuracy": accuracy})
这是我的主程序代码。我的目标是训练 CNN 查看一座积木塔的图像,并预测图像中有多少块积木。
# Load and process dataset
image_files = []
text_files = []
images = []
labels = []

# Collect the names of the image files ('before' in the name) and of the
# metadata files ('text' in the name) found under images/.
for root, dirs, files in os.walk("images"):
    for filename in files:
        if 'before' in filename:
            image_files.append(filename)
        elif 'text' in filename:
            text_files.append(filename)

# Class label for each supported block count:
# label 0 means 2 blocks, label 1 means 3 blocks,
# label 2 means 4 blocks, label 3 means 5 blocks.
label_for_block_count = {2: 0, 3: 1, 4: 2, 5: 3}

# For each pair of files, append the relevant data to the image and label lists.
for imagename in image_files:
    # Read the image of THIS iteration. The previous code read
    # 'images/' + filename, i.e. the stale variable left over from the
    # os.walk loop above, so every entry of `images` was the same file.
    image = cv2.imread('images/' + imagename)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail loudly rather than letting None reach the float32 conversion.
        raise IOError('Could not read image file: images/' + imagename)
    images.append(image)

    # The image number sits between a 7-character prefix and the 4-character
    # extension of the file name.
    num = imagename[7:len(imagename) - 4]
    for textname in text_files:
        if ('_' + num + '.') in textname:
            # `with` guarantees the metadata file is closed again.
            with open('images/' + textname, 'r') as textfile:
                for line in textfile:
                    if 'Number of blocks' in line:
                        nblocks = int(line[18:].strip('\n'))
                        if nblocks not in label_for_block_count:
                            # Previously an unexpected count silently reused
                            # the label of the preceding image.
                            raise ValueError(
                                'Unsupported block count %d in images/%s'
                                % (nblocks, textname))
                        labels.append(label_for_block_count[nblocks])
# Separate images and labels into train and test sets - 50% train, 50% evaluate.
# Floor division (//) keeps the slice indices integers on both Python 2 and
# Python 3; a plain `/` yields a float on Python 3, which is not a valid index.
train_images = images[0:len(images) // 2]
train_labels = labels[0:len(labels) // 2]
test_images = images[len(images) // 2:]
test_labels = labels[len(labels) // 2:]

# Convert the dataset into numpy arrays: float32 pixel data, int32 class ids.
train_data_numpy = np.array(train_images, np.float32)
train_labels_numpy = np.array(train_labels, np.int32)
test_data_numpy = np.array(test_images, np.float32)
test_labels_numpy = np.array(test_labels, np.int32)
# Put images through CNN

# Estimator wrapping the CNN model function; checkpoints live in models/cnn.
classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="models/cnn")

# Log the tensor named "softmax_tensor" on every training iteration.
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log,
    every_n_iter=1)

# Training: one shuffled example per step, repeating the data indefinitely,
# stopped after 200 steps.
train_features = {"images": train_data_numpy}
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=train_features,
    y=train_labels_numpy,
    batch_size=1,
    num_epochs=None,
    shuffle=True)
classifier.train(
    input_fn=train_input_fn,
    steps=200,
    hooks=[logging_hook])

# Evaluation: a single ordered pass over the held-out half, then print metrics.
eval_features = {"images": test_data_numpy}
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x=eval_features,
    y=test_labels_numpy,
    num_epochs=1,
    shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)
我在Ubuntu 16.04上使用Python 2.7.12。任何有关NaN损失发生原因的见解都将不胜感激。
答案 0 :(得分:1)
找到解决方案了!事实证明,模型之前保存的检查点与当前的训练会话相冲突,所以我删除了模型保存检查点的文件夹中的所有内容,现在训练时不再出现 NaN 损失错误。
答案 1 :(得分:1)
补充 The Impossible Squish 提供的答案:如果您从未通过 model_dir 为 TensorFlow 指定保存检查点的目录,
那么您需要使用 print(tempfile.gettempdir())
来查找 TensorFlow 默认保存检查点的位置(当然,您需要先 import tempfile)。
在训练之前,请确保没有任何东西会把 NaN 带入模型(这正是我遇到问题的原因)。这本应作为评论发布,但我的声望值不够,无法评论。