我想在一个 HDF5 文件中存储超过 100 万张图像。问题是,在 for 循环中逐张保存图像时,速度会随着时间推移越来越慢:刚开始时,150 万张图像的预计总耗时为 80 小时;运行半小时、处理了约 6000 张图像之后,预计总耗时上升到了 130 小时。数据是一个元组列表,每张图像对应一个包含 id 和 url 的元组。download_image 函数先检查 id 是否已存在于 HDF5 的 id_dset 中,若不存在则下载图像。save_image 函数在必要时扩大 HDF5 的 image_dset 和 id_dset,然后保存 id 和图像。最后,下载任务交给一个线程池(multiprocessing.pool.ThreadPool)并发执行。
def download_image(id_url):
    """Download one image and return it converted to RGB.

    Reads the module-level ``id_dset`` to skip ids that are already stored
    and writes one line to the module-level ``log_file`` for every image
    that is skipped or fails.

    Args:
        id_url: An ``(id, url)`` tuple.

    Returns:
        ``(id, image)`` where ``image`` is the result of
        ``Image.open(...).convert('RGB')``, or ``None`` when the id is
        already present in ``id_dset`` or when downloading, parsing or
        converting the image fails.
    """
    image_id, url = id_url

    # NOTE(review): `in` on an h5py dataset presumably iterates over every
    # stored id, so this check would get slower as the file grows -- a likely
    # cause of the reported slowdown. Consider loading the known ids into an
    # in-memory set once and testing against that instead. TODO confirm.
    if image_id in id_dset:
        log_file.write('Image {} already exists. Skipping download\n'.format(image_id))
        return None

    # `except Exception` (not a bare `except`) keeps this best-effort while
    # still letting KeyboardInterrupt / SystemExit stop a long-running job.
    try:
        # Assumes `request` is urllib.request, whose response object is a
        # context manager; closing it releases the connection promptly.
        with request.urlopen(url) as response:
            image_data = response.read()
    except Exception:
        log_file.write('Could not download image {} from {}\n'.format(image_id, url))
        return None

    try:
        image = Image.open(BytesIO(image_data))
    except Exception:
        log_file.write('Failed to parse image {}\n'.format(image_id))
        return None

    try:
        image = image.convert('RGB')
    except Exception:
        log_file.write('Failed to convert image {} to RGB\n'.format(image_id))
        return None

    return image_id, image
def save_image(idx, id, image):  # noqa: A002 - `id` is part of the existing signature
    """Write one image and its id to row ``idx`` of the module-level datasets.

    When ``idx`` is past the current end of ``image_dset``, both
    ``image_dset`` and ``id_dset`` are first resized to ``idx + 1000`` rows
    so that a resize is not needed for every single image.

    Args:
        idx: Row index to write to.
        id: Identifier stored in ``id_dset[idx]``.
        image: Image data; converted with ``np.array`` before being stored
            in ``image_dset[idx]``.

    Returns:
        None. A failure is not raised; it is recorded in ``log_file``.
    """
    # `except Exception` (not a bare `except`) keeps the save best-effort
    # while still letting KeyboardInterrupt / SystemExit abort the run.
    try:
        if image_dset.shape[0] <= idx:
            new_length = idx + 1000
            image_dset.resize(new_length, axis=0)
            id_dset.resize(new_length, axis=0)
        # The image is written before the id, so a failed image write
        # leaves the id slot untouched.
        image_dset[idx] = np.array(image)
        id_dset[idx] = id
    except Exception:
        log_file.write('Failed to save image {}\n'.format(id))
# Create an empty file
chunk_size = 10  # number of images per HDF5 chunk of the 'image' dataset
input_dir = '../input/images-h5/'

# exist_ok=True replaces the check-then-create pattern, which can fail if
# the directory appears between the check and the makedirs call.
os.makedirs(input_dir, exist_ok=True)

# Mode 'w' creates images.h5 from scratch, truncating any existing file.
with h5py.File(input_dir + 'images.h5', 'w') as f:
    # Starts with zero rows and is unlimited along axis 0 so that it can be
    # resized as images are added; every row is one 400x400x3 uint8 image.
    # NOTE(review): one chunk is chunk_size * 400 * 400 * 3 bytes (about
    # 4.8 MB) before compression, which may exceed the HDF5 chunk cache and
    # make single-row writes expensive -- confirm against the h5py docs.
    image_dset = f.create_dataset(
        name='image',
        shape=(0, 400, 400, 3),
        maxshape=(None, 400, 400, 3),
        dtype=np.uint8,
        chunks=(chunk_size, 400, 400, 3),
        compression='gzip',
        compression_opts=3,
    )
    # Variable-length string dtype for the image ids.
    dt = h5py.special_dtype(vlen=str)
    id_dset = f.create_dataset(name='id', shape=(0,), maxshape=(None,), dtype=dt)
from multiprocessing.pool import ThreadPool


def _first_free_index(ids):
    """Return the index of the first empty id, or ``len(ids)`` if none is empty.

    Emptiness is tested with ``len`` so that it works whether the ids are
    read back as ``str`` or as ``bytes``.
    """
    for position, value in enumerate(ids):
        if len(value) == 0:
            return position
    return len(ids)


log_path = input_dir + 'log_{}.txt'.format(datetime.datetime.now().isoformat())

# Add to file
# The context managers guarantee that the log file, the HDF5 file and the
# worker threads are released even if the loop raises.
with open(log_path, 'w') as log_file:
    with h5py.File(input_dir + 'images.h5', 'r+', libver='latest') as f:
        image_dset = f['image']
        id_dset = f['id']

        # Resume after the last stored image. Unlike np.argmax over an
        # `== ''` mask, this does not fall back to row 0 (and overwrite
        # existing data) when the dataset has no empty slot left.
        idx = _first_free_index(id_dset[:])

        with ThreadPool(6) as pool:
            # Downloads run in the worker threads; results are consumed in
            # input order and written to the HDF5 file from this thread.
            results = pool.imap(download_image, id_url_list)
            for res in tqdm.tqdm_notebook(results, total=len(id_url_list)):
                if res is not None:
                    image_id, image = res
                    save_image(idx, image_id, image)
                    idx += 1