我正在使用tensorflow的对象检测API,每当我执行训练时,它都会在几次迭代后停止。最初,我使用jpg格式的图像来创建XML文件,然后将其转换为CSV,但是人们提到错误的原因可能是使用jpg而不是jpeg(尽管其他人已经将其以jpg格式工作)。然后,我将图像转换为jpeg并执行了其余步骤,然后进行了培训,并出现了同样的问题。我一直在这个问题上停留了很长时间,但无济于事,而且似乎没有很多可行的解决方案。如果有人有解决这个问题的想法,我将非常感激。下面的代码
Instructions for updating:
Please switch to tf.train.get_or_create_global_step
WARNING:root:Variable [Conv/biases/Momentum] is not available in checkpoint
WARNING:root:Variable [Conv/weights/Momentum] is not available in checkpoint
WARNING:root:Variable [FirstStageBoxPredictor/BoxEncodingPredictor/biases/Momentum] is not available in checkpoint
WARNING:root:Variable [FirstStageBoxPredictor/BoxEncodingPredictor/weights/Momentum] is not available in checkpoint
....
INFO:tensorflow:global step 1: loss = 1.6760 (13.660 sec/step)
INFO:tensorflow:global step 1: loss = 1.6760 (13.660 sec/step)
INFO:tensorflow:Finished training! Saving model to disk.
INFO:tensorflow:Finished training! Saving model to disk.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/summary/writer/writer.py:386: UserWarning: Attempting to use a closed FileWriter. The operation will be a noop unless the FileWriter is explicitly reopened.
warnings.warn("Attempting to use a closed FileWriter. "
Traceback (most recent call last):
File "train.py", line 185, in <module>
tf.app.run()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/platform/app.py", line 125, in run
_sys.exit(main(argv))
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "train.py", line 181, in main
graph_hook_fn=graph_rewriter_fn)
File "/usr/local/lib/python3.6/dist-packages/object_detection-0.1-py3.6.egg/object_detection/legacy/trainer.py", line 416, in train
saver=saver)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 785, in train
ignore_live_threads=ignore_live_threads)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/supervisor.py", line 832, in stop
ignore_live_threads=ignore_live_threads)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/usr/local/lib/python3.6/dist-packages/six.py", line 693, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/queue_runner_impl.py", line 257, in _run
enqueue_callable()
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1257, in _single_operation_run
self._call_tf_sessionrun(None, {}, [], target_list, None)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape mismatch in tuple component 18. Expected [1,?,?,3], got [1,1,314,384,3]
[[{{node batch/padding_fifo_queue_enqueue}}]]
Train.py
"""Training executable for detection models.
This executable is used to train DetectionModels. There are two ways of
configuring the training job:
1) A single pipeline_pb2.TrainEvalPipelineConfig configuration file
can be specified by --pipeline_config_path.
Example usage:
./train \
--logtostderr \
--train_dir=path/to/train_dir \
--pipeline_config_path=pipeline_config.pbtxt
2) Three configuration files can be provided: a model_pb2.DetectionModel
configuration file to define what type of DetectionModel is being trained, an
input_reader_pb2.InputReader file to specify what training data will be used and
a train_pb2.TrainConfig file to configure training parameters.
Example usage:
./train \
--logtostderr \
--train_dir=path/to/train_dir \
--model_config_path=model_config.pbtxt \
--train_config_path=train_config.pbtxt \
--input_config_path=train_input_config.pbtxt
"""
#changed object_detection.builders/legacy/utils to builders...
import functools
import json
import os
import tensorflow as tf
from builders import dataset_builder
from builders import graph_rewriter_builder
from builders import model_builder
from legacy import trainer
from utils import config_util
tf.logging.set_verbosity(tf.logging.INFO)
flags = tf.app.flags
flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
flags.DEFINE_integer('task', 0, 'task id')
flags.DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.')
flags.DEFINE_boolean('clone_on_cpu', False,
'Force clones to be deployed on CPU. Note that even if '
'set to False (allowing ops to run on gpu), some ops may '
'still be run on the CPU if they have no GPU kernel.')
flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
'replicas.')
flags.DEFINE_integer('ps_tasks', 0,
'Number of parameter server tasks. If None, does not use '
'a parameter server.')
flags.DEFINE_string('train_dir', '',
'Directory to save the checkpoints and training summaries.')
flags.DEFINE_string('pipeline_config_path', '',
'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
'file. If provided, other configs are ignored')
flags.DEFINE_string('train_config_path', '',
'Path to a train_pb2.TrainConfig config file.')
flags.DEFINE_string('input_config_path', '',
'Path to an input_reader_pb2.InputReader config file.')
flags.DEFINE_string('model_config_path', '',
'Path to a model_pb2.DetectionModel config file.')
FLAGS = flags.FLAGS
@tf.contrib.framework.deprecated(None, 'Use object_detection/model_main.py.')
def main(_):
assert FLAGS.train_dir, '`train_dir` is missing.'
if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
if FLAGS.pipeline_config_path:
configs = config_util.get_configs_from_pipeline_file(
FLAGS.pipeline_config_path)
if FLAGS.task == 0:
tf.gfile.Copy(FLAGS.pipeline_config_path,
os.path.join(FLAGS.train_dir, 'pipeline.config'),
overwrite=True)
else:
configs = config_util.get_configs_from_multiple_files(
model_config_path=FLAGS.model_config_path,
train_config_path=FLAGS.train_config_path,
train_input_config_path=FLAGS.input_config_path)
if FLAGS.task == 0:
for name, config in [('model.config', FLAGS.model_config_path),
('train.config', FLAGS.train_config_path),
('input.config', FLAGS.input_config_path)]:
tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
overwrite=True)
model_config = configs['model']
train_config = configs['train_config']
input_config = configs['train_input_config']
model_fn = functools.partial(
model_builder.build,
model_config=model_config,
is_training=True)
def get_next(config):
return dataset_builder.make_initializable_iterator(
dataset_builder.build(config)).get_next()
create_input_dict_fn = functools.partial(get_next, input_config)
env = json.loads(os.environ.get('TF_CONFIG', '{}'))
cluster_data = env.get('cluster', None)
cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
task_data = env.get('task', None) or {'type': 'master', 'index': 0}
task_info = type('TaskSpec', (object,), task_data)
# Parameters for a single worker.
ps_tasks = 0
worker_replicas = 1
worker_job_name = 'lonely_worker'
task = 0
is_chief = True
master = ''
if cluster_data and 'worker' in cluster_data:
# Number of total worker replicas include "worker"s and the "master".
worker_replicas = len(cluster_data['worker']) + 1
if cluster_data and 'ps' in cluster_data:
ps_tasks = len(cluster_data['ps'])
if worker_replicas > 1 and ps_tasks < 1:
raise ValueError('At least 1 ps task is needed for distributed training.')
if worker_replicas >= 1 and ps_tasks > 0:
# Set up distributed training.
server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
job_name=task_info.type,
task_index=task_info.index)
if task_info.type == 'ps':
server.join()
return
worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
task = task_info.index
is_chief = (task_info.type == 'master')
master = server.target
graph_rewriter_fn = None
if 'graph_rewriter_config' in configs:
graph_rewriter_fn = graph_rewriter_builder.build(
configs['graph_rewriter_config'], is_training=True)
trainer.train(
create_input_dict_fn,
model_fn,
train_config,
master,
task,
FLAGS.num_clones,
worker_replicas,
FLAGS.clone_on_cpu,
ps_tasks,
worker_job_name,
is_chief,
FLAGS.train_dir,
graph_hook_fn=graph_rewriter_fn)
if __name__ == '__main__':
tf.app.run()
答案 0 :(得分:0)
此行应给您一个提示:Expected [1,?,?,3], got [1,1,314,384,3]
Tensorflow使用4D张量作为模型的图像输入,这就是为什么期望尺寸为[1,?,?,3]
的张量的原因。但是,您提供了5D张量。我猜想,某个地方的代码中有很多tf.expand_dims()
。
答案 1 :(得分:0)
对于任何遇到此问题的人,请检查您的火车并测试CSV文件,以查看是否存在宽度和高度均为0的条目。通常,如果图像的格式与扩展名不同,则通常会发生这种情况。通过删除这些图像或使用-
将其转换为正确的格式来解决该问题img = cv2.imread(test_full_path)
cv2.imwrite(test_full_path, img, [int(cv2.IMWRITE_JPEG_QUALITY), 100])