我正在按照 https://www.tensorflow.org/tutorials/layers 的 CNN MNIST 教程来完成我自己的图像分类任务。我的输入图像大小是 224 * 224 * 3,而不是教程中的 28 * 28,并且我只有 5 个类别而不是 10 个。我读过以前关于这个问题的帖子,很多人指出学习率太高或者 cross_entropy_loss 数值过大可能导致该问题,但我不确定我这里是不是这种情况。
当我开始训练时,我立即得到这个NaN损失训练错误:
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "cnn_model.py", line 75, in <module>
main(sys.argv[1], sys.argv[2])
File "cnn_model.py", line 68, in main
classifier.train(input_fn = train_input_fn, steps = 2000, hooks = [logging_hook])
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 612, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 505, in run
run_metadata=run_metadata)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 842, in run
run_metadata=run_metadata)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 798, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 960, in run
run_metadata=run_metadata))
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 477, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
以下是模型代码:
import tensorflow as tf
from helper import load_data_and_label
import cv2
import sys
import math
def cnn_model_fn(features, labels, mode):
    """Model function for a 5-class CNN classifier on 224x224x3 images.

    Args:
        features: dict with key 'x' holding the image batch
            (flattened or already shaped; reshaped to [-1, 224, 224, 3]).
        labels: integer class labels in [0, 5) — assumed; confirm against
            the loader in `load_data_and_label`.
        mode: a tf.estimator.ModeKeys value (TRAIN / EVAL / PREDICT).

    Returns:
        tf.estimator.EstimatorSpec for the given mode.

    NOTE(review): the input is fed to conv1 with no normalization. If the
    pixels are raw [0, 255] values, activations/logits can blow up and
    produce the NaN loss seen during training — scale images to [0, 1]
    (or standardize) in the input pipeline.
    """
    # Input layer: restore the NHWC image shape.
    input_layer = tf.reshape(features['x'], [-1, 224, 224, 3])

    # Conv/pool stack: five conv(5x5, ReLU) + 2x2/stride-2 max-pool pairs.
    # Each pool halves the spatial size: 224 -> 112 -> 56 -> 28 -> 14 -> 7.
    conv1 = tf.layers.conv2d(inputs=input_layer, filters=32,
                             kernel_size=[5, 5], padding='same',
                             activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[5, 5],
                             padding='same', activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    conv3 = tf.layers.conv2d(inputs=pool2, filters=64, kernel_size=[5, 5],
                             padding='same', activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2)

    conv4 = tf.layers.conv2d(inputs=pool3, filters=64, kernel_size=[5, 5],
                             padding='same', activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=2)

    conv5 = tf.layers.conv2d(inputs=pool4, filters=64, kernel_size=[5, 5],
                             padding='same', activation=tf.nn.relu)
    pool5 = tf.layers.max_pooling2d(inputs=conv5, pool_size=[2, 2], strides=2)

    # Dense head: flatten the final 7x7x64 feature map.
    pool5_flat = tf.reshape(pool5, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool5_flat, units=1024,
                            activation=tf.nn.relu)
    # Dropout is only active in TRAIN mode.
    dropout = tf.layers.dropout(inputs=dense, rate=0.5,
                                training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer: one unit per class (5 classes).
    logits = tf.layers.dense(inputs=dropout, units=5)

    predictions = {"classes": tf.argmax(input=logits, axis=1),
                   "prob": tf.nn.softmax(logits, name='softmax_tensor')}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss: softmax cross-entropy against one-hot labels.
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=5)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits)

    # Training op: plain SGD.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientSescentOptimizer if False else tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          train_op=train_op)

    # Evaluation metrics.
    # BUG FIX: the EstimatorSpec keyword is `eval_metric_ops`, not
    # `eval_metrics_ops`; the original spelling raises a TypeError as soon
    # as classifier.evaluate() is called.
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(
        labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      eval_metric_ops=eval_metric_ops)
def main(imagepath, labelpath):
    """Load the dataset, train the CNN estimator, then evaluate it."""
    train_data, train_labels, eval_data, eval_labels = \
        load_data_and_label(imagepath, labelpath)

    # Build the Estimator around our model function.
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir="/tmp/retina_convnet_model")

    # Log the softmax probabilities every 50 steps.
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"prob": "softmax_tensor"}, every_n_iter=50)

    # Train for 2000 steps on shuffled mini-batches of 32.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_data}, y=train_labels,
        batch_size=32, num_epochs=None, shuffle=True)
    classifier.train(input_fn=train_input_fn, steps=2000,
                     hooks=[logging_hook])

    # Single deterministic pass over the held-out set.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    print(classifier.evaluate(input_fn=eval_input_fn))


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
非常感谢你!任何帮助都会非常感激!
答案 0(得分:0)
您是否对图像进行了任何预处理?如果没有,那么可能尝试标准化帮助函数中的图像,看看是否有帮助。