I have a lot of data from my experiments and want to use it to train a Keras model. I considered using a keras.utils.Sequence generator so that all preprocessing runs in parallel with the GPU training of the model. So I wrote a generator that opens a random file from the experiments and produces one batch per __getitem__ call; the index argument is not used. Here is the code:
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 13:07:52 2018
@author: niklas
A generator based on the Keras Sequence class to get noisy data on the fly
"""
import os, sys
import numpy as np
from tensorflow.python.keras.utils import Sequence
from numpy.random import randint
import imageio
from normalize import normalize_from_uint16_data
class noise_generator_real(Sequence):
    def __init__(
        self, base_dir, batch_size, images_per_epoch,
        z_pixels, y_pixels, x_pixels,
        norm_data=normalize_from_uint16_data,
        percentile_noise=99.5,
        percentile_boost=99.9
    ):
        self.base_dir = base_dir
        self.batch_size = batch_size
        self.images_per_epoch = images_per_epoch
        # no per-image preprocessing, so load a single image per batch
        # and cut all patches of that batch from it
        self.patches_per_img = batch_size
        self.shape = (z_pixels, y_pixels, x_pixels, 1)
        self.percentile_noise = percentile_noise
        self.percentile_boost = percentile_boost
        self.norm_data = norm_data
        # self.calls_to_getitem = 0
        # self.epoch = 0
        self.delete_indexes = []
        self.filenames = []
        # pair every ground-truth file in "boost" with its noisy
        # counterpart in "noise" (same name, channel tag swapped to ch2)
        for file in os.listdir(os.path.join(self.base_dir, "boost")):
            if file.endswith(".tif"):
                split = file.split("_")
                split[1] = "ch2"
                file_noise = "_".join(split)
                self.filenames.append([
                    os.path.join(self.base_dir, "boost", file),
                    os.path.join(self.base_dir, "noise", file_noise)
                ])
        # print(len(self.filenames))
    def on_epoch_end(self):
        pass
        # TODO / TO FIND OUT:
        # WHY is every worker killed on epoch end?
        # self.epoch += 1
        # print("\n\n\nEpoch {} is done!!!\n\n\n".format(self.epoch))

    def __len__(self):
        return int(
            np.ceil(self.images_per_epoch / float(self.batch_size))
        )
    def __getitem__(self, index):
        x_noise = np.empty((self.batch_size, *self.shape))
        y_boost = np.empty((self.batch_size, *self.shape))
        x_pixel = self.shape[2]
        y_pixel = self.shape[1]
        z_pixel = self.shape[0]
        i = 0
        while i < self.batch_size:
            # pick a random experiment file pair
            [filename_boost, filename_noise] = self.filenames[
                randint(len(self.filenames))
            ]
            # load boost ground truth
            boost = imageio.volread(filename_boost).astype(np.float32)
            # load noise image
            noise = imageio.volread(filename_noise).astype(np.float32)
            # calculate variables for normalization
            percentile_boost = np.percentile(boost, self.percentile_boost)
            minimum_boost = np.min(boost)
            percentile_noise = np.percentile(noise, self.percentile_noise)
            minimum_noise = np.min(noise)
            x_len = boost.shape[2]
            y_len = boost.shape[1]
            x_range = int(x_len) - x_pixel
            y_range = int(y_len) - y_pixel
            j = 0
            while j < self.patches_per_img and i < self.batch_size:
                # get random patch offsets
                x_start = randint(x_range)
                y_start = randint(y_range)
                x_end = x_start + x_pixel
                y_end = y_start + y_pixel
                z_start = 0
                z_end = z_pixel
                # crop the data
                tmp_boost = boost[z_start:z_end, y_start:y_end, x_start:x_end]
                # skip all-zero patches and draw new offsets instead;
                # is this helpful or too time-intensive? seems to work
                if not np.any(tmp_boost):
                    continue
                tmp_noise = noise[z_start:z_end, y_start:y_end, x_start:x_end]
                # print("tmp_noise: ", tmp_noise.shape)
                # sys.stdout.flush()
                x_noise[i, ...] = self.norm_data(
                    np.reshape(tmp_noise, self.shape),
                    percentile_noise, minimum_noise
                )
                y_boost[i, ...] = self.norm_data(
                    np.reshape(tmp_boost, self.shape),
                    percentile_boost, minimum_boost
                )
                j += 1
                i += 1
        # print("\n\nfinished one batch\n\n")
        # self.calls_to_getitem += 1
        # print(self.calls_to_getitem)
        # sys.stdout.flush()
        return x_noise, y_boost
I start the training process with the following snippet:
generator = noise_generator_real(
    trainingdir, batchsize, pics_per_epoch,
    z_pixel, y_pixel, x_pixel
)
workers = 5
self.model.fit_generator(
    generator, epochs=input_epochs,
    verbose=1, callbacks=callbacks,
    initial_epoch=(self.epoch + 1),
    use_multiprocessing=True,
    workers=workers, max_queue_size=10
)
This often works: GPU utilization reaches 100%, and several processes show 100% CPU utilization, which suggests the preprocessing is keeping up. In some runs, however, GPU utilization drops to 0%, and all the CPU processes drop to 0% as well. When I attach to one of the stuck processes with gdb python, I see it waiting on a futex:
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
0x00007f03fa9f1827 in futex_abstimed_wait_cancelable (private=0, abstime=0x0, expected=0, futex_word=0x1f7b8fe0) at ../sysdeps/unix/sysv/linux/futex-internal.h:205
205     ../sysdeps/unix/sysv/linux/futex-internal.h: No such file or directory.
I realized that I can barely debug this setup, because logging or printing anything seems to trigger the deadlock itself.
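Since any print() from inside a worker can apparently cause the hang, one idea I have not tried yet is to get stack traces without going through the normal I/O machinery: Python's faulthandler module writes tracebacks directly to a file descriptor, bypassing print()/logging, so it should still work in the deadlocked state. A minimal sketch, assuming the Keras workers are forked on Linux and therefore inherit the handler registered in the parent:

import faulthandler
import signal

# Register a handler that dumps the tracebacks of all threads to stderr.
# faulthandler writes straight to the file descriptor and does not use
# print()/logging, so it should work even while the process is stuck.
# Forked keras workers inherit this handler (Linux fork semantics), so a
# stuck worker can be inspected from outside with: kill -USR1 <worker_pid>
faulthandler.register(signal.SIGUSR1, all_threads=True)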
In an earlier version of the generator I opened a new random experiment file for every single image in a batch, and with that version I ran into this futex deadlock far more often. My hypothesis is therefore that the deadlock occurs when two different processes try to open the same file. Is that a plausible assumption, and how can I avoid this deadlock?
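If the assumption is right, I suppose I could test it by serializing all file reads across the workers with a single inter-process lock and checking whether the hangs disappear. A sketch of what I have in mind (locked_volread is a hypothetical helper, not part of my current code; this assumes the fork start method on Linux, so a lock created before fit_generator is inherited by all keras workers):

import multiprocessing

import imageio
import numpy as np

# One lock shared by all workers; it must be created before the workers
# fork so that every child process inherits the same underlying semaphore.
read_lock = multiprocessing.Lock()

def locked_volread(filename):
    # Only one process at a time may open and read a file. If this makes
    # the futex hangs go away, concurrent opens are the likely culprit.
    with read_lock:
        return imageio.volread(filename).astype(np.float32)

The generator would then call locked_volread(filename_boost) and locked_volread(filename_noise) instead of calling imageio.volread directly.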