我的代码结构如下 - 我有一个主文件,它在循环中调用两个函数 - >
1- function1()(存在于单独的文件中)从文件夹中读取图像文件说"输入"对图像进行一些转换并将这些转换后的图像保存到文件夹中,然后输出"输出"
2- function2()训练神经网络以映射"输入"文件夹到"输出"文件夹
主要功能存在于mainfile.py中,function1()存在于单独的文件中,而function2()存在于不同的文件中(因此总共有三个文件)
def function1():
images = reader.image(FLAGS.BATCH_SIZE, FLAGS.HEIGHT, FLAGS.WIDTH, FLAGS.TRAIN_IMAGES_PATH, subtract_mean=True)
\\loss function stuff
with tf.Session(config=config) as sess:
sess.run(init_op)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start_time = time.time()
batches_count = 0
try:
while not coord.should_stop():
writer = tf.summary.FileWriter(FLAGS.summary_path, sess.graph)
sess.run([batch_assign_op,initial_assign_op])
batch_time = time.time()
for step in range(FLAGS.NUM_ITERATIONS):
_, loss_t = sess.run(
[train_op, total_loss])
summary_str = sess.run(summary)
writer.add_summary(summary_str, step)
image_t = sess.run(jpegs)
for i in range(image_t.shape[0]):
filename = '%s/%d.jpg' % (FLAGS.outputdir, (batches_count * FLAGS.BATCH_SIZE + i + 1))
with open(filename, 'wb') as f:
f.write(image_t[i])
f.close()
batches_count = batches_count + 1
after = process.memory_percent()
print("MEMORY CHANGE %.4f -> %.4f" % (before, after))
before = after
except tf.errors.OutOfRangeError:
print("final time elspased", (time.time() - start_time))
print('Done doing non paramteric part')
finally:
coord.request_stop()
coord.join(threads)
接下来是函数2的定义
def function2():
tf.reset_default_graph()
run_id = FLAGS.MODEL_NAME if FLAGS.MODEL_NAME else str(uuid.uuid4())
model_path = '%s/%s' % (FLAGS.MODEL_DIR, run_id)
if not os.path.exists(model_path):
os.makedirs(model_path)
images, target_ig = reader.net_batch(FLAGS.BATCH_SIZE, FLAGS.HEIGHT, FLAGS.WIDTH, FLAGS.TRAIN_IMAGES_PATH,
FLAGS.TARGET_PATH, epochs=FLAGS.EPOCHS, subtract_mean=False,zero_one=False
,input_mean=FLAGS.input_mean,target_mean=FLAGS.output_mean)
ae_inputs = tf.placeholder(tf.float32, (None, FLAGS.HEIGHT, FLAGS.WIDTH, 3),
name='auto_input') # input to the network (MNIST images)
target = tf.placeholder(tf.float32, (None, FLAGS.HEIGHT, FLAGS.WIDTH, 3),
name='target')
ae_output = model.net(ae_inputs, training=True)
learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate')
#ogit_output = tf.nn.sigmoid(ae_output)
# loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=logit_output,name="p_loss"))
size = tf.size(ae_output)
loss = tf.nn.l2_loss((target - ae_output), name="p_loss")/tf.to_float(size)
global_step = tf.Variable(FLAGS.gs_val, name="p_global_step", trainable=False)
train_op = tf.train.AdamOptimizer(learning_rate, name="p_trainopt").minimize(loss, global_step=global_step)
# Statistics
average_pl = tf.placeholder(tf.float32,shape=[])
with tf.name_scope('losses'):
tf.summary.scalar('total loss', loss)
tf.summary.image('param generated', (ae_output))
tf.summary.image('original', (ae_inputs))
tf.summary.image('target', (target))
tf.summary.scalar("average_loss", average_pl)
summary = tf.summary.merge_all()
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session(config=config) as sess:
writer = tf.summary.FileWriter(FLAGS.summary_path, sess.graph)
saver = tf.train.Saver(tf.trainable_variables())
file = tf.train.latest_checkpoint(model_path)
sess.run(init_op)
if file:
print('Restoring model from {}'.format(file))
saver.restore(sess, file)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(coord=coord)
start_time = time.time()
start_ti = time.time()
acc_loss = []
count =1
try:
while not coord.should_stop():
ig_b, tg_b =sess.run([images,target_ig])
if count % FLAGS.record == 0:
_, loss_t, step, summary_str = sess.run([train_op, loss, global_step, summary],
feed_dict={ae_inputs: ig_b,
target: tg_b,
learning_rate: FLAGS.LEARNING_RATE,
average_pl:(sum(acc_loss)/len(acc_loss))})
print(step,"--> loss: ", loss_t," ,average loss: ",(sum(acc_loss)/len(acc_loss)),
" ,elpased time: ", elapsed_time)
if FLAGS.writesum:
writer.add_summary(summary_str, step)
if count % FLAGS.chanelr == 0:
FLAGS.LEARNING_RATE = FLAGS.LEARNING_RATE / FLAGS.div
print('learning rate now: ', FLAGS.LEARNING_RATE)
acc_loss = []
else:
_, loss_t, step = sess.run([train_op, loss, global_step],
feed_dict={ae_inputs: ig_b,
target: tg_b,
learning_rate: FLAGS.LEARNING_RATE})
acc_loss.append(loss_t)
elapsed_time = time.time() - start_time
start_time = time.time()
count = count+1
except tf.errors.OutOfRangeError:
print(step, loss_t, elapsed_time)
saver.save(sess, model_path + '/style-model',global_step=step)
print("final time elspased", (time.time() - start_ti))
print('Done training -- epoch limit reached')
finally:
coord.request_stop()
coord.join(threads)
return step
以下是批量读取图像的功能 -
def image(batch, height, width, path, epochs=1, shuffle=False, subtract_mean = False):
# TODO: Find a proper way to do this
ll = [int(i.split('.')[0]) for i in listdir(path)]
ll.sort()
filenames = [join(path, '%d.jpg' %(f)) for f in ll if isfile(join(path, '%d.jpg' %(f)))]
png = filenames[0].lower().endswith('png') # If first file is a png, assume they all are
filename_queue = tf.train.string_input_producer(filenames,
shuffle=shuffle, num_epochs=epochs)
reader = tf.WholeFileReader()
_, img_bytes = reader.read(filename_queue)
image = tf.image.decode_png(img_bytes, channels=3)\
if png else tf.image.decode_jpeg(img_bytes, channels=3)
processed_image = preprocess(image, height,width, subtract_mean=subtract_mean)
return tf.train.batch([processed_image], batch)
def net_batch(batch, height, width, path_input,path_target, epochs=1, zero_one= True, shuffle=False,
subtract_mean = False,input_mean=None, target_mean=None):
#TODO: Find a proper way to do this
ll_in = [int(i.split('.')[0]) for i in listdir(path_input)]
ll_in.sort()
filenames_input = [join(path_input, '%d.jpg' % (f)) for f in ll_in if isfile(join(path_input, '%d.jpg' % (f)))]
#filenames_input = [join(path_input, f) for f in listdir(path_input) if isfile(join(path_input, f))]
png_input = filenames_input[0].lower().endswith('png') # If first file is a png, assume they all are
ll_out = [int(i.split('.')[0]) for i in listdir(path_target)]
ll_out.sort()
filenames_target = [join(path_target, '%d.jpg' % (f)) for f in ll_out if isfile(join(path_target, '%d.jpg' % (f)))]
# filenames_target = [join(path_target, f) for f in listdir(path_target) if isfile(join(path_target, f))]
png_output = filenames_target[0].lower().endswith('png') # If first file is a png, assume they all are
file_queue = tf.train.slice_input_producer([filenames_input,filenames_target],shuffle=True, num_epochs=epochs)
input_file = tf.read_file(file_queue[0])
input_image = tf.image.decode_png(input_file, channels=3)\
if png_input else tf.image.decode_jpeg(input_file, channels=3)
processed_image_input = preprocess(input_image, height,width, subtract_mean=subtract_mean,zero_one=zero_one,
net=True,meandata=input_mean)
target_file = tf.read_file(file_queue[1])
target_image = tf.image.decode_png(target_file, channels=3) \
if png_output else tf.image.decode_jpeg(target_file, channels=3)
processed_image_target = preprocess(target_image, height, width, subtract_mean=subtract_mean,zero_one=zero_one,
net=True,meandata=target_mean)
return tf.train.batch([processed_image_input,processed_image_target], batch)
问题在几次运行循环之间调用function1()和 function2()整个代码崩溃,给出以下错误 -
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in appo
rt_excepthook
File "<frozen importlib._bootstrap>", line 969, in _find_and_load
File "<frozen importlib._bootstrap>", line 958, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 673, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 661, in exec_module
File "<frozen importlib._bootstrap_external>", line 766, in get_code
File "<frozen importlib._bootstrap_external>", line 818, in get_data
OSError: [Errno 24] Too many open files: '/usr/lib/python3/dist-packages/apport
/__init__.py'
Original exception was:
Traceback (most recent call last):
File "Mainfile.py", line 157, in <module>
File "Mainfile.py", line 131, in main
File "/home/suryabhan/Desktop/New_NST_MAC/slowNST.py", line 246, in Nonpapram
etric
File "/home/suryabhan/Desktop/New_NST_MAC/slowNST.py", line 214, in create_st
yleimage
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coord
inator.py", line 389, in join
File "/home/suryabhan/.local/lib/python3.5/site-packages/six.py", line 686, i
n reraise
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/queue
_runner_impl.py", line 238, in _run
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session
.py", line 1235, in _single_operation_run
File "/usr/lib/python3.5/contextlib.py", line 66, in __exit__
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/erro
rs_impl.py", line 466, in raise_exception_on_not_ok_status
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Input/76834.jpg
[[Node: ReaderReadV2 = ReaderReadV2[_device="/job:localhost/replica:0/
task:0/cpu:0"](WholeFileReaderV2, input_producer)]]
[[Node: Assert_2/Assert/_72 = _Recv[client_terminated=false, recv_devi
ce="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica
:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_157_Assert_2/Ass
ert", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]
]
我花了好几个小时试图修复这个问题,但是无处接近,我的理论是函数1()中的线程即使在调用coord.request_stop()
之后也不会停止,这会继续累积开放线程并最终导致代码崩溃。即使这是原因,我也不知道如何解决它。有人可以帮我这里。
我已尽力以正确的方式解决问题,请不要低估这个问题。我很乐意提供您要求解决上述问题的任何信息。