我甚至无法在一个训练循环中运行我的代码,它停留在sess.run或training_op.run函数(永远运行代码......)。我不知道这个错误在哪里。
samples_all, labels_all = getsamples()
上面的代码加载数据集。 samples_all是包含图像路径的列表。图像的大小为240 * 320 * 3. labels_all是一个包含密集类的列表。有101个班级。 我进入sess.run函数,发现它进入_do_call函数,并执行fn(* args)。但是,它永远不会返回,也不会捕获任何异常。
import pickle
import re
import random
import numpy as np
import tensorflow as tf
from tensorflow.contrib import *
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
def vgg16Net(inputs,
num_classes=1000,
is_training=True,
dropout_keep_prob=0.5,
spatial_squeeze=True,
scope='vgg_16'):
with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
end_points_collection = sc.name + '_end_points'
# Collect outputs for conv2d, fully_connected and max_pool2d
with framework.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
outputs_collections=end_points_collection):
net = layers.repeat(inputs, 2, layers.conv2d, 64, [3, 3], scope='conv1')
net = layers.max_pool2d(net, [2, 2], scope='pool1')
net = layers.repeat(net, 2, layers.conv2d, 128, [3, 3], scope='conv2')
net = layers.max_pool2d(net, [2, 2], scope='pool2')
net = layers.repeat(net, 3, layers.conv2d, 256, [3, 3], scope='conv3')
net = layers.max_pool2d(net, [2, 2], scope='pool3')
net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv4')
net = layers.max_pool2d(net, [2, 2], scope='pool4')
net = layers.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv5')
net = layers.max_pool2d(net, [2, 2], scope='pool5')
# Use conv2d instead of fully_connected layers.
net = layers.conv2d(net, 4096, [7, 10], padding='VALID', scope='fc6')
net = layers.dropout(net, dropout_keep_prob, is_training=is_training,
scope='dropout6')
net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
net = layers.dropout(net, dropout_keep_prob, is_training=is_training,
scope='dropout7')
net = layers.conv2d(net, num_classes, [1, 1],
activation_fn=None,
normalizer_fn=None,
scope='fc8')
# Convert end_points_collection into a end_point dict.
end_points = layers.utils.convert_collection_to_dict(end_points_collection)
if spatial_squeeze:
net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
end_points[sc.name + '/fc8'] = net
return net, end_points
def getsamples():
rootpath = 'C:\\Users\\mx\\Desktop\\nextLevel\\UCF-101'
with open('res.pickle', 'rb') as f:
pathdict = pickle.load(f)
with open('trainlist01.txt', 'r') as f:
lines=f.readlines()
samples_all=[]
labels_all=[]
for line in lines:
[videogroup, video, label]=re.split('/| |\n', line)[0:3]
samples = [rootpath + '\\' + videogroup + '\\' + video[:-4] + '\\' + i for i in pathdict[videogroup][video]]
samples_all.extend(samples)
labels_all.extend([label]*len(samples))
return samples_all, labels_all
samples_all, labels_all = getsamples()
numOfSamples = len(samples_all)
labels_one_hot_all = np.zeros((len(labels_all), 101))
index_offset = np.arange(len(labels_all))*101
ind = index_offset + np.array(labels_all, np.int32) - 1
labels_one_hot_all.flat[ind]=1
#samples_all = tf.constant(samples_all)
#labels_all = tf.constant(labels_all)
#samples_all_p = tf.placeholder(dtype = tf.string, shape = (numOfSamples,) )
#labels_one_hot_all_p = tf.placeholder(dtype = tf.float32, shape = (numOfSamples, 101))
#samples_all_v = tf.Variable(np.asarray(['']*numOfSamples), name = 'sample', trainable = False)
#labels_one_hot_all_v = tf.Variable(np.zeros_like(labels_one_hot_all, dtype=np.float32), name = 'label', trainable = False)
[sample, label] = tf.train.slice_input_producer([samples_all, labels_one_hot_all])
imagecontent = tf.read_file(sample)
image = tf.image.decode_jpeg(imagecontent, channels=3)
image = tf.cast(image, dtype = tf.float32)
channels = tf.split(2, 3, image)
channels[0] -= _R_MEAN
channels[1] -= _G_MEAN
channels[2] -= _B_MEAN
image=tf.concat(2, channels)
image=tf.reshape(image, [240, 320, 3])
images, labels = tf.train.batch([image, label], 16, 3, 32)
net, end = vgg16Net(images, num_classes = 101, is_training=True)
losses.softmax_cross_entropy(net, labels)
total_loss = losses.get_total_loss()
global_step = tf.Variable(0, trainable = False, name = 'global_step')
starter_learning_rate = 0.1
learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
100000, 0.96, staircase=True)
train_var = framework.get_variables_to_restore(exclude = ['vgg_16/conv1', 'vgg_16/conv2', 'vgg_16/conv2', 'vgg_16/conv3', 'vgg_16/conv4', 'vgg_16/conv5', 'global_step', 'sample', 'label']);
init_var = framework.get_variables_to_restore(exclude = ['vgg_16/fc6', 'vgg_16/fc7', 'vgg_16/fc8', 'global_step', 'sample', 'label'])
init_op, feed_init = framework.assign_from_checkpoint('./vgg_16.ckpt', init_var)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
training_op = optimizer.minimize(total_loss, global_step, var_list=train_var)
#with tf.name_scope('accuracy'):
# with tf.name_scope('correct_prediction'):
# correct_prediction = tf.equal(tf.argmax(net, 1), tf.argmax(labels, 1))
# with tf.name_scope('accuracy'):
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
#summaries
model_store_dir = 'C:\\Users\\mx\\Desktop\\nextLevel\\nextLevel\\nextLevel\\log1\\'
summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
#summaries.add(tf.summary.scalar('accuracy', accuracy))
for end_point in end:
x=end[end_point]
summaries.add(tf.summary.histogram('activations/' + end_point, x))
summaries.add(tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))
for loss in tf.get_collection(tf.GraphKeys.LOSSES):
summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))
for variable in framework.get_model_variables():
summaries.add(tf.summary.histogram(variable.op.name, variable))
summaries.add(tf.summary.scalar('learning_rate', learning_rate))
summaries.add(tf.summary.scalar('total_loss', total_loss))
#summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES))
summary_op = tf.summary.merge(list(summaries))
summary_writer = tf.summary.FileWriter(model_store_dir)
with tf.Session() as sess:
tf.global_variables_initializer().run()
sess.run(init_op, feed_dict=feed_init)
for i in range(100000):
if i % 100 == 99:
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
s, _ = sess.run([summary_op, training_op],
options=run_options,
run_metadata=run_metadata)
summary_writer.add_run_metadata(run_metadata, 'step%03d' % i)
summary_writer.add_summary(s, i)
print('Adding run metadata for', i)
else:
training_op.run()
s=summary_op.run()
summary_writer.add_summary(s, i)
s, _ = sess.run([summary_op, training_op])
答案 0 :(得分:6)
TL; DR:您需要在运行init_op
后以及开始训练循环之前添加tf.train.start_queue_runners(sess)
。
tf.train.batch()
函数使用TensorFlow队列将输入数据累积到批处理中。这些队列由后台线程填充,后台线程在您调用tf.train.start_queue_runners()
时创建。如果您不调用此方法,后台线程将无法启动,队列将保持为空,并且训练操作将无限期地阻止等待输入。
答案 1 :(得分:3)
尝试使用tf.train.MonitoredTrainingSession
来完成队列运行器的所有初始化和启动。