So I took an image-classification example and tried to adapt it to my own data. I have 2 classes (there may be more in the future), e.g. "car" and "other" (where "other" means anything else). Every image is 200x200 RGB, and here is my problem:
my loss is always NaN, no matter how many epochs I run.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import math
import os
import time

import numpy as np
from PIL import Image
import tensorflow as tf
# Basic model parameters as external flags.
flags = tf.flags
FLAGS = flags.FLAGS
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('max_steps', 2000, 'Number of steps to run trainer.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_integer('batch_size', 128, 'Batch size. '
                     'Must divide evenly into the dataset sizes.')
flags.DEFINE_string('train_dir', os.path.abspath("ModelData"),
                    'Directory to put the training data.')
flags.DEFINE_boolean('fake_data', False, 'If true, uses fake data '
                     'for unit testing.')
NUM_CLASSES = 2
IMAGE_SIZE = 200
CHANNELS = 3
IMAGE_PIXELS = IMAGE_SIZE * IMAGE_SIZE * CHANNELS
def inference(images, hidden1_units, hidden2_units):
    # Hidden 1
    with tf.name_scope('hidden1'):
        weights = tf.Variable(
            tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
                                stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
            name='weights')
        biases = tf.Variable(tf.zeros([hidden1_units]), name='biases')
        hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
    # Hidden 2
    with tf.name_scope('hidden2'):
        weights = tf.Variable(
            tf.truncated_normal([hidden1_units, hidden2_units],
                                stddev=1.0 / math.sqrt(float(hidden1_units))),
            name='weights')
        biases = tf.Variable(tf.zeros([hidden2_units]), name='biases')
        hidden2 = tf.nn.relu(tf.matmul(hidden1, weights) + biases)
    # Linear
    with tf.name_scope('softmax_linear'):
        weights = tf.Variable(
            tf.truncated_normal([hidden2_units, NUM_CLASSES],
                                stddev=1.0 / math.sqrt(float(hidden2_units))),
            name='weights')
        biases = tf.Variable(tf.zeros([NUM_CLASSES]), name='biases')
        logits = tf.matmul(hidden2, weights) + biases
    return logits
def cal_loss(logits, labels):
    labels = tf.to_int64(labels)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels, name='xentropy')
    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
    return loss

def training(loss, learning_rate):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = optimizer.minimize(loss, global_step=global_step)
    return train_op

def evaluation(logits, labels):
    correct = tf.nn.in_top_k(logits, labels, 1)
    return tf.reduce_sum(tf.cast(correct, tf.int32))

def placeholder_inputs(batch_size):
    images_placeholder = tf.placeholder(tf.float32, shape=(batch_size, IMAGE_PIXELS))
    labels_placeholder = tf.placeholder(tf.int32, shape=batch_size)
    return images_placeholder, labels_placeholder

def fill_feed_dict(images_feed, labels_feed, images_pl, labels_pl):
    feed_dict = {
        images_pl: images_feed,
        labels_pl: labels_feed,
    }
    return feed_dict
def do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_set):
    # And run one epoch of eval.
    true_count = 0  # Counts the number of correct predictions.
    steps_per_epoch = 128 // FLAGS.batch_size
    num_examples = steps_per_epoch * FLAGS.batch_size
    for step in range(steps_per_epoch):
        # Note: this feeds the global training arrays; data_set is unused here.
        feed_dict = fill_feed_dict(train_images, train_labels, images_placeholder, labels_placeholder)
        true_count += sess.run(eval_correct, feed_dict=feed_dict)
    precision = true_count / num_examples
    print(' Num examples: %d Num correct: %d Precision @ 1: %0.04f' % (num_examples, true_count, precision))
# Get the sets of images and labels for training.
def init_training_data_set(dir):
    train_images = []
    train_labels = []

    def GetFoldersList():
        mylist = []
        filelist = os.listdir(dir)
        for name in filelist:
            if os.path.isdir(os.path.join(dir, name)):
                mylist.append(name)
        return mylist

    def ReadImagesFromFolder(folder):
        fin_dir = os.path.join(dir, folder)
        images_name = os.listdir(fin_dir)
        images = []
        for img_name in images_name:
            img_location = os.path.join(dir, folder)
            final_loc = os.path.join(img_location, img_name)
            try:
                # The label is derived from a hash of the folder name.
                hash_folder = int(hashlib.md5(folder.encode()).hexdigest(), 16) % (10 ** 8)
                images.append((np.array(Image.open(final_loc).convert('RGB')), hash_folder))
            except:
                pass
        return images

    folders = GetFoldersList()
    for folder in folders:
        for imgs in ReadImagesFromFolder(folder):
            train_images.append(imgs[0])
            train_labels.append(imgs[1])
    return train_images, train_labels
train_images, train_labels = init_training_data_set(os.path.join("FetchData", "Image"))
train_images = np.array(train_images)
train_images = train_images.reshape(len(train_images), IMAGE_PIXELS)
train_labels = np.array(train_labels)
def run_training():
    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Graph().as_default():
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = placeholder_inputs(len(train_images))
        # Build a Graph that computes predictions from the inference model.
        logits = inference(images_placeholder, FLAGS.hidden1, FLAGS.hidden2)
        # Add to the Graph the Ops for loss calculation.
        loss = cal_loss(logits, labels_placeholder)
        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = training(loss, FLAGS.learning_rate)
        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = evaluation(logits, labels_placeholder)
        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()
        # Create a session for running Ops on the Graph.
        sess = tf.Session()
        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        # And then after everything is built, start the training loop.
        for step in range(FLAGS.max_steps):
            start_time = time.time()
            feed_dict = fill_feed_dict(train_images, train_labels, images_placeholder, labels_placeholder)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time
            print("Current step is: " + str(step))
            print("Current loss value: " + str(loss_value))
            print("Current duration: " + str(duration))
            print("\n")
            if (step + 1) % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                saver.save(sess, FLAGS.train_dir, global_step=step)
                print('Training Data Eval:')
                do_eval(sess, eval_correct, images_placeholder, labels_placeholder, train_images)
def main(_):
    run_training()

if __name__ == '__main__':
    tf.app.run()
Also, I should mention that I have already searched for this NaN-loss issue and found that it is usually said to have something to do with the learning rate, so I made several attempts, but nothing changed (I went from 0.1 all the way down to 0.0001, and nothing...).
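(For completeness, a minimal way I could try different rates without editing the code, assuming the script is saved as train.py, is to override the flag on the command line, since tf.app.run() parses it:)

python train.py --learning_rate 0.0001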
So if you have any idea how I can fix this, or how to optimize or simplify the code, please let me know.
Answer 0 (score: 1)
Answer 1 (score: 0)
This is because your labels are out of range:
hash_folder = int(hashlib.md5(folder.encode()).hexdigest(), 16) % (10 ** 8 )
This code returns labels far outside the [0, 2) range. On my setup your code raises an error, but once the labels are mapped into [0, 2) it works fine.
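A minimal sketch of that mapping, reusing the helpers already defined in your question (the folder_to_label dictionary is my own naming, not part of your original code): derive each label from the folder's position in the sorted folder list instead of hashing the folder name:

# Map each class folder to a contiguous index in [0, NUM_CLASSES),
# e.g. {"car": 0, "other": 1}, instead of hashing the folder name.
folders = sorted(GetFoldersList())
folder_to_label = {name: idx for idx, name in enumerate(folders)}

# Then, inside ReadImagesFromFolder, build the label by lookup:
label = folder_to_label[folder]
images.append((np.array(Image.open(final_loc).convert('RGB')), label))

With your two folders this yields the labels 0 and 1, which is exactly the [0, NUM_CLASSES) range that tf.nn.sparse_softmax_cross_entropy_with_logits expects.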