TensorFlow: Too many open files

Date: 2018-02-26 09:27:06

Tags: multithreading python-3.x tensorflow

My code is structured as follows - I have a main file that calls two functions in a loop ->

1- function1() (in a separate file) reads image files from a folder, say "Input", applies some transformations to them, and saves the transformed images to another folder, say "Output"

2- function2() trains a neural network to map the "Input" folder to the "Output" folder

The main function lives in mainfile.py, function1() is in one separate file, and function2() is in another (so three files in total).
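For context, the calling pattern in mainfile.py looks roughly like this (a simplified sketch; the module names and NUM_CYCLES below are placeholders, the real files are larger):

from transform_file import function1   # placeholder module names for this sketch
from train_file import function2

NUM_CYCLES = 10                         # placeholder for however many cycles I run

def main():
    for cycle in range(NUM_CYCLES):
        function1()   # read "Input", transform, and write images to "Output"
        function2()   # train the network that maps "Input" to "Output"

if __name__ == '__main__':
    main()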

def function1():
    images = reader.image(FLAGS.BATCH_SIZE, FLAGS.HEIGHT, FLAGS.WIDTH, FLAGS.TRAIN_IMAGES_PATH, subtract_mean=True)
    # loss function stuff
    with tf.Session(config=config) as sess:
        sess.run(init_op)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        start_time = time.time()
        batches_count = 0
        before = process.memory_percent()  # 'process' is a psutil.Process created earlier
        try:
            while not coord.should_stop():
                writer = tf.summary.FileWriter(FLAGS.summary_path, sess.graph)
                sess.run([batch_assign_op, initial_assign_op])
                batch_time = time.time()
                for step in range(FLAGS.NUM_ITERATIONS):
                    _, loss_t = sess.run([train_op, total_loss])

                summary_str = sess.run(summary)
                writer.add_summary(summary_str, step)
                image_t = sess.run(jpegs)
                for i in range(image_t.shape[0]):
                    filename = '%s/%d.jpg' % (FLAGS.outputdir, (batches_count * FLAGS.BATCH_SIZE + i + 1))
                    with open(filename, 'wb') as f:
                        f.write(image_t[i])  # closed automatically by the with block
                batches_count = batches_count + 1
                after = process.memory_percent()
                print("MEMORY CHANGE %.4f -> %.4f" % (before, after))
                before = after
        except tf.errors.OutOfRangeError:
            print("final time elapsed", (time.time() - start_time))
            print('Done doing non parametric part')
        finally:
            coord.request_stop()
        coord.join(threads)

Next comes the definition of function2 -

def function2():
    tf.reset_default_graph()
    run_id = FLAGS.MODEL_NAME if FLAGS.MODEL_NAME else str(uuid.uuid4())
    model_path = '%s/%s' % (FLAGS.MODEL_DIR, run_id)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    images, target_ig = reader.net_batch(FLAGS.BATCH_SIZE, FLAGS.HEIGHT, FLAGS.WIDTH, FLAGS.TRAIN_IMAGES_PATH,
                                         FLAGS.TARGET_PATH, epochs=FLAGS.EPOCHS, subtract_mean=False,zero_one=False
                                         ,input_mean=FLAGS.input_mean,target_mean=FLAGS.output_mean)
    ae_inputs = tf.placeholder(tf.float32, (None, FLAGS.HEIGHT, FLAGS.WIDTH, 3),
                               name='auto_input')  # input to the network (MNIST images)
    target = tf.placeholder(tf.float32, (None, FLAGS.HEIGHT, FLAGS.WIDTH, 3),
                            name='target')

    ae_output = model.net(ae_inputs, training=True)
    learning_rate = tf.placeholder(tf.float32, shape=[], name='learning_rate')

    # logit_output = tf.nn.sigmoid(ae_output)
    # loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=logit_output, name="p_loss"))

    size = tf.size(ae_output)
    loss = tf.nn.l2_loss((target - ae_output), name="p_loss")/tf.to_float(size)

    global_step = tf.Variable(FLAGS.gs_val, name="p_global_step", trainable=False)
    train_op = tf.train.AdamOptimizer(learning_rate, name="p_trainopt").minimize(loss, global_step=global_step)


    # Statistics
    average_pl = tf.placeholder(tf.float32,shape=[])
    with tf.name_scope('losses'):
        tf.summary.scalar('total loss', loss)
        tf.summary.image('param generated', (ae_output))
        tf.summary.image('original', (ae_inputs))
        tf.summary.image('target', (target))
        tf.summary.scalar("average_loss", average_pl)
    summary = tf.summary.merge_all()
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter(FLAGS.summary_path, sess.graph)
        saver = tf.train.Saver(tf.trainable_variables())
        file = tf.train.latest_checkpoint(model_path)
        sess.run(init_op)
        if file:
            print('Restoring model from {}'.format(file))
            saver.restore(sess, file)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        start_time = time.time()
        start_ti = time.time()
        acc_loss = []
        count = 1
        try:
            while not coord.should_stop():
                ig_b, tg_b = sess.run([images, target_ig])
                if count % FLAGS.record == 0:
                    _, loss_t, step, summary_str = sess.run([train_op, loss, global_step, summary],
                                                            feed_dict={ae_inputs: ig_b,
                                                                       target: tg_b,
                                                                       learning_rate: FLAGS.LEARNING_RATE,
                                                                       average_pl:(sum(acc_loss)/len(acc_loss))})
                    print(step, "--> loss: ", loss_t, " ,average loss: ", (sum(acc_loss)/len(acc_loss)),
                          " ,elapsed time: ", elapsed_time)
                    if FLAGS.writesum:
                        writer.add_summary(summary_str, step)

                    if count % FLAGS.chanelr == 0:
                        FLAGS.LEARNING_RATE = FLAGS.LEARNING_RATE / FLAGS.div
                        print('learning rate now: ', FLAGS.LEARNING_RATE)
                    acc_loss = []
                else:
                    _, loss_t, step = sess.run([train_op, loss, global_step],
                                                       feed_dict={ae_inputs: ig_b,
                                                                  target: tg_b,
                                                                learning_rate: FLAGS.LEARNING_RATE})
                    acc_loss.append(loss_t)
                    elapsed_time = time.time() - start_time
                    start_time = time.time()
                count = count+1

        except tf.errors.OutOfRangeError:
            print(step, loss_t, elapsed_time)
            saver.save(sess, model_path + '/style-model',global_step=step)
            print("final time elspased", (time.time() - start_ti))
            print('Done training -- epoch limit reached')
        finally:
            coord.request_stop()
        coord.join(threads)
    return step

Below are the functions that read the images in batches -

def image(batch, height, width, path, epochs=1, shuffle=False, subtract_mean = False):

    # TODO: Find a proper way to do this
    ll = [int(i.split('.')[0]) for i in listdir(path)]
    ll.sort()
    filenames = [join(path, '%d.jpg' % (f)) for f in ll if isfile(join(path, '%d.jpg' % (f)))]

    png = filenames[0].lower().endswith('png')  # If first file is a png, assume they all are

    filename_queue = tf.train.string_input_producer(filenames,
                                                    shuffle=shuffle, num_epochs=epochs)

    reader = tf.WholeFileReader()
    _, img_bytes = reader.read(filename_queue)
    image = tf.image.decode_png(img_bytes, channels=3)\
        if png else tf.image.decode_jpeg(img_bytes, channels=3)

    processed_image = preprocess(image, height,width, subtract_mean=subtract_mean)
    return tf.train.batch([processed_image], batch)

def net_batch(batch, height, width, path_input, path_target, epochs=1, zero_one=True, shuffle=False,
              subtract_mean=False, input_mean=None, target_mean=None):

    #TODO: Find a proper way to do this
    ll_in = [int(i.split('.')[0]) for i in listdir(path_input)]
    ll_in.sort()
    filenames_input = [join(path_input, '%d.jpg' % (f)) for f in ll_in if isfile(join(path_input, '%d.jpg' % (f)))]
    #filenames_input = [join(path_input, f) for f in listdir(path_input) if isfile(join(path_input, f))]

    png_input = filenames_input[0].lower().endswith('png')  # If first file is a png, assume they all are

    ll_out = [int(i.split('.')[0]) for i in listdir(path_target)]
    ll_out.sort()
    filenames_target = [join(path_target, '%d.jpg' % (f)) for f in ll_out if isfile(join(path_target, '%d.jpg' % (f)))]
    # filenames_target = [join(path_target, f) for f in listdir(path_target) if isfile(join(path_target, f))]

    png_output = filenames_target[0].lower().endswith('png')  # If first file is a png, assume they all are

    file_queue = tf.train.slice_input_producer([filenames_input,filenames_target],shuffle=True, num_epochs=epochs)

    input_file = tf.read_file(file_queue[0])
    input_image = tf.image.decode_png(input_file, channels=3)\
        if png_input else tf.image.decode_jpeg(input_file, channels=3)
    processed_image_input = preprocess(input_image, height,width, subtract_mean=subtract_mean,zero_one=zero_one,
                                       net=True,meandata=input_mean)

    target_file = tf.read_file(file_queue[1])
    target_image = tf.image.decode_png(target_file, channels=3) \
        if png_output else tf.image.decode_jpeg(target_file, channels=3)
    processed_image_target = preprocess(target_image, height, width, subtract_mean=subtract_mean,zero_one=zero_one,
                                        net=True,meandata=target_mean)

    return tf.train.batch([processed_image_input,processed_image_target], batch)

The problem: after a few runs of the loop calling function1() and function2(), the whole program crashes with the following error -

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/apport_python_hook.py", line 63, in apport_excepthook
  File "<frozen importlib._bootstrap>", line 969, in _find_and_load
  File "<frozen importlib._bootstrap>", line 958, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 673, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 661, in exec_module
  File "<frozen importlib._bootstrap_external>", line 766, in get_code
  File "<frozen importlib._bootstrap_external>", line 818, in get_data
OSError: [Errno 24] Too many open files: '/usr/lib/python3/dist-packages/apport/__init__.py'

Original exception was:
Traceback (most recent call last):
  File "Mainfile.py", line 157, in <module>
  File "Mainfile.py", line 131, in main
  File "/home/suryabhan/Desktop/New_NST_MAC/slowNST.py", line 246, in Nonpaprametric
  File "/home/suryabhan/Desktop/New_NST_MAC/slowNST.py", line 214, in create_styleimage
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
  File "/home/suryabhan/.local/lib/python3.5/site-packages/six.py", line 686, in reraise
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/queue_runner_impl.py", line 238, in _run
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1235, in _single_operation_run
  File "/usr/lib/python3.5/contextlib.py", line 66, in __exit__
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Input/76834.jpg
         [[Node: ReaderReadV2 = ReaderReadV2[_device="/job:localhost/replica:0/task:0/cpu:0"](WholeFileReaderV2, input_producer)]]
         [[Node: Assert_2/Assert/_72 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_157_Assert_2/Assert", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

I have spent hours trying to fix this but am nowhere near a solution. My theory is that the threads started in function1() do not actually stop even after coord.request_stop() is called, so open threads (and the files they hold) keep accumulating and eventually crash the code. Even if that is the cause, I don't know how to fix it. Can someone help me here? I have tried my best to ask the question properly, so please don't downvote it. I am happy to provide any information you need to solve the above problem.
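For reference, this is a minimal check I can call between the two functions to see whether file descriptors and threads really keep accumulating (assuming psutil is installed; num_fds() works on Linux):

import threading
import psutil

proc = psutil.Process()   # handle to the current process

def report_resources(tag):
    # print how many file descriptors and Python threads are currently open/alive
    print('%s: open fds = %d, live threads = %d'
          % (tag, proc.num_fds(), threading.active_count()))

# e.g. report_resources('after function1') and report_resources('after function2')
# inside the main loop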

0 Answers:

There are no answers yet.