使用h5py保存到hdf5文件会随着时间的推移而变慢

时间:2018-08-29 09:19:56

标签: python multiprocessing h5py

我想在hdf5文件中存储> 1M图像。问题是将它们保存在for循环中会随着时间的流逝而变慢。从150万张图像的80小时开始,到处理6K图像后的半小时,该时间下降到130小时。 数据以元组列表的形式出现。对于每个图像,都有一个包含id和url的元组。 download_image函数检查id是否在hdf5 id_dset中并下载图像。如果需要,save_image函数将hdf5 image_dset和hdf5 id_dset递增,并保存id和图像。最后,所有这些都进入了一个多处理池。

def download_image(id_url):    
    (id, url) = id_url

    if id in id_dset:
        log_file.write('Image {} already exists. Skipping download\n'.format(id))
        return None  

    try:
        response = request.urlopen(url)
        image_data = response.read()
    except:
        log_file.write('Could not download image {} from {}\n'.format(id, url))
        return None

    try:
        image = Image.open(BytesIO(image_data))        
    except:
        log_file.write('Failed to parse image {}\n'.format(id))
        return None

    try:
        image = image.convert('RGB')
    except:
        log_file.write('Failed to convert image {} to RGB\n'.format(id))
        return None

    return id, image

def save_image(idx, id, image):
    try:
        if image_dset.shape[0] <= idx:
            image_dset.resize(idx+1000, axis=0)
            id_dset.resize(idx+1000, axis=0)

        image_dset[idx] = np.array(image)
        id_dset[idx] = id

    except:
        log_file.write('Failed to save image {}\n'.format(id))

# Create an empty file
chunk_size = 10
input_dir = '../input/images-h5/'

if not os.path.exists(input_dir):
    os.makedirs(input_dir)

with h5py.File(input_dir + 'images.h5', 'w') as f:

    image_dset = f.create_dataset(name='image', shape=(0, 400, 400, 3), maxshape=(None, 400, 400, 3), dtype=np.uint8, 
                               chunks=(chunk_size, 400, 400, 3), compression='gzip', compression_opts=3)

    dt = h5py.special_dtype(vlen=str)
    id_dset = f.create_dataset(name='id', shape=(0,), maxshape=(None,), dtype=dt)

from multiprocessing.pool import ThreadPool
pool = ThreadPool(6)

log_file = open(input_dir + 'log_{}.txt'.format(datetime.datetime.now().isoformat()), 'w')

# Add to file
with h5py.File(input_dir + 'images.h5', 'r+', libver='latest') as f:

    image_dset = f['image']
    id_dset = f['id']

    if id_dset.shape[0] == 0:
        idx = 0        
    else: 
        idx = np.argmax(id_dset[:] == '')

    for res in tqdm.tqdm_notebook(pool.imap(download_image, id_url_list), total=len(id_url_list)):
        if res is not None:
            id, image = res

            save_image(idx, id, image)   

            idx += 1

log_file.close()

0 个答案:

没有答案