Question

我在GPU上运行TensorFlow 0.12.1。我有一个已经训练好的深度CNN模型，并用检查点文件保存了它的权重。在推理期间，我使用restorer.restore(sess, tf.train.latest_checkpoint(FLAGS.train_dir))重新加载已保存的检查点。代码运行起来似乎没有问题，但每次重新运行脚本，得到的输出都是混乱的。据我所知（AFAIK），我没有改动测试集的输入，输入也被正确加载并馈送到了网络。然而，对同一个测试集、以相同的顺序多次运行同一个CNN，得到的输出却大不相同。我很困惑！此外，在推理期间，如何在不运行init_op的情况下执行已加载检查点的计算图？看来我的代码需要在执行之前初始化所有全局变量和局部变量（我是先初始化，然后才恢复检查点！）。这是我的代码片段：

import tensorflow as tf
import numpy as np
import os
import os.path
from datetime import datetime
import time
import random
import json

from tensorflow.python.framework import ops
from tensorflow.python.framework import dtypes


from modelFCNN3 import model 

def read_input(inp_queue,height=224,width=224,channels=3, mask=False):
  """Build the graph ops that load one PNG file as a float32 image tensor.

  Args:
    inp_queue: value passed straight to ``tf.read_file``, i.e. the tensor
      holding the path of the PNG file to load.
    height: target height used for the resize and the static shape.
    width: target width used for the resize and the static shape.
    channels: channel count used for the static shape. The PNG is decoded
      without an explicit channel count, so this only declares the shape.
    mask: when true, the image is binarised (1.0 where the pixel value is
      >= 128, 0.0 elsewhere); otherwise it is passed through
      ``tf.image.per_image_standardization``.

  Returns:
    A float32 tensor with static shape ``[height, width, channels]``.
  """
  target_shape = [height, width, channels]

  raw_bytes = tf.read_file(inp_queue)
  decoded = tf.image.decode_png(raw_bytes)
  # NOTE(review): method=2 is presumably bicubic in tf.image.ResizeMethod --
  # confirm against the TensorFlow version in use.
  resized = tf.image.resize_images(decoded, [height, width], method=2)

  # Back to uint8 after the resize, then pin the static shape.
  pixels = tf.cast(resized, tf.uint8)
  pixels.set_shape(target_shape)
  pixels = tf.reshape(pixels, target_shape)

  if not mask:
    standardized = tf.image.per_image_standardization(pixels)
    return tf.cast(standardized, tf.float32)

  binary = tf.to_float(tf.greater_equal(pixels, 128))
  return tf.cast(binary, tf.float32)




if __name__ == '__main__':
    # Inference script: builds the input pipeline and the model graph,
    # restores the latest checkpoint from FLAGS.train_dir and evaluates the
    # streaming mean IoU over one pass of the test set.

    tf.reset_default_graph()

    with open('X_test.json', 'r') as infile:
        X_test = json.load(infile)

    with open('y_test.json', 'r') as infile:
        y_test = json.load(infile)

    imagelist = ops.convert_to_tensor(X_test, dtype=dtypes.string)
    labellist = ops.convert_to_tensor(y_test, dtype=dtypes.string)

    # One epoch, no shuffling: every run walks the test set in the same order.
    input_queue = tf.train.slice_input_producer([imagelist, labellist],
                                            num_epochs=1,
                                            shuffle=False)

    image = read_input(input_queue[0],height=224,width=224,channels=3, mask=False)

    label = read_input(input_queue[1],height=224,width=224,channels=1, mask=True)

    images_batch, labels_batch = tf.train.batch([image, label], batch_size=FLAGS.batch_size,
        enqueue_many=False,shapes=None, allow_smaller_final_batch=True)

    global_step = tf.Variable(0, trainable=False)
    images = tf.placeholder_with_default(images_batch, shape=[None, 224,224,3])
    labels = tf.placeholder_with_default(labels_batch, shape=[None, 224,224,1])

    logits = model(images).logits

    # The Saver must be created AFTER the model graph is built. A default
    # tf.train.Saver() only covers the variables that exist when it is
    # constructed; building it before model(images) left the network weights
    # out of the restore, so they kept their random initial values and the
    # predictions differed on every run.
    restorer = tf.train.Saver()

    labels = tf.cast(labels,tf.int32)
    labels.set_shape([FLAGS.batch_size,224,224,1])

    valid_prediction = tf.argmax(tf.nn.softmax(logits), dimension=3)
    valid_prediction.set_shape([FLAGS.batch_size,224,224])

    meanIOU,update_op_mIOU= tf.contrib.metrics.streaming_mean_iou(tf.cast(valid_prediction,tf.int32), tf.squeeze(labels),FLAGS.num_classes)

    init = tf.global_variables_initializer()
    init_locals = tf.local_variables_initializer()


    with tf.Session() as sess:

        # Local variables (the epoch counter created by num_epochs=1 and the
        # streaming-metric accumulators) still have to be initialised here.
        # NOTE(review): they are presumably not part of the checkpoint --
        # confirm. The global initializer is kept for safety; restore()
        # below overwrites every variable the Saver covers.
        sess.run([init, init_locals])

        checkpoint_path = tf.train.latest_checkpoint(FLAGS.train_dir)
        if checkpoint_path is None:
            # latest_checkpoint returns None when no checkpoint exists; fail
            # with a clear message instead of an obscure error in restore().
            raise IOError('No checkpoint found in %s' % FLAGS.train_dir)
        restorer.restore(sess, checkpoint_path)
        print("Model restored.")


        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord,sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        step = 0
        avg = []
        try:
            while not coord.should_stop():
                # Fetch everything that depends on the input batch in ONE
                # run call. Each sess.run that touches the batch dequeues a
                # new one, so separate calls would score the metric on a
                # different batch than the images/predictions fetched here.
                myimg, predimg, mylbl, _ = sess.run(
                    [images, valid_prediction, labels, update_op_mIOU])
                # Read the metric after its update op has run.
                # NOTE(review): this assumes meanIOU is computed from the
                # metric's accumulator variables only and does not dequeue
                # another batch -- confirm for the TensorFlow version in use.
                mIOU = sess.run(meanIOU)
                avg.append(mIOU)
                step += 1

        except tf.errors.OutOfRangeError:
            # Raised by the input queue once the single epoch is exhausted.
            print('Done training -- epoch limit reached')

        finally:
            coord.request_stop()
            coord.join(threads)
            summary_writer.close()
            # The session is closed by the enclosing `with` block.

Answer 1

您是在同一台机器上运行，还是在不同的机器上运行？ # saver = tf.train.Saver()

TensorFlow文档中有如下注释： # NOTE：仅当设备分配未更改时，才能从已保存的meta_graph重新开始训练。 # saver = tf.train.import_meta_graph(metafile)

使用已保存的检查点多次运行推理时，模型输出出现随机差异 - TensorFlow

1 个答案: