I have five HDF5 files, each 22 GB. Each HDF5 file is a series of 4801 images, 1920 x 1200 in size. I need to load the same frame number from each HDF5 file, get rid of some rogue pixels, average the stack of 5 images, and write the processed image for each frame number to a new HDF5 file. Because I can't load all five HDF5 files at once without running out of RAM, I can only read chunks of images from each HDF5 file, put the 5 images for each frame number on a queue, process the stack, and write the resulting image out to the HDF5 file. Right now I'm using h5py for all reading/writing of the HDF5 files.
I'd like to know what the most computationally efficient way to work on the chunked data is. Right now, I dedicate one process as the writer, then loop over some chunk size of data, create a number of consumers for that chunk, put the data on a queue, wait for the consumers to finish, then rinse and repeat until all the images are processed. This means that new consumer processes are created every time the loop advances, which I imagine adds some overhead. A sample of the code is below.
#!/usr/bin/env python
import time
import os
from multiprocessing import Process, Queue, JoinableQueue, cpu_count
import glob
import h5py
import numpy as np
'''Function definitions'''
# The consumer function takes data off of the Queue
def consumer(inqueue, output):
    # Run indefinitely
    while True:
        # If the queue is empty, queue.get() will block until the queue has data
        all_data = inqueue.get()
        if all_data:
            # n is the index corresponding to the projection location
            n, image_data = all_data
            # Replace zingers with the median, then average the stack
            # Find the median for each pixel of the prefiltered image
            med = np.median(image_data, axis=0)
            # Loop through the image set
            for j in range(image_data.shape[0]):
                replicate = image_data[j, ...]
                # zinger_level is a module-level threshold defined elsewhere in the script
                mask = replicate - med > zinger_level
                replicate[mask] = med[mask]        # Substitute with median
                image_data[j, ...] = replicate     # Put data back in place
            out = np.mean(image_data, axis=0, dtype=np.float32).astype(np.uint16)
            output.put((n, out))
        else:
            break
# Function for writing out the HDF5 file
def write_hdf(output, output_filename):
    # Write processed frames to the output HDF5 file as they arrive
    while True:
        args = output.get()
        if args:
            i, data = args
            with h5py.File(output_filename, 'a') as fout:
                fout['Prefiltered_images'][i, ...] = data
        else:
            break
def fprocess_hdf_stack(hdf_filenames, output_filename):
    file_list = []
    for fname in hdf_filenames:
        file_list.append(h5py.File(fname, 'r'))
    # Process chunks of data so that we don't run out of memory
    totsize = file_list[0]['exchange']['data'].shape[0]
    data_shape = file_list[0]['exchange']['data'].shape
    # Create the output dataset up front so the writer can fill it in place
    fout = h5py.File(output_filename, 'w')
    fout.create_dataset('Prefiltered_images', data_shape, dtype=np.uint16)
    fout.close()
    ints = range(totsize)
    chunkSize = 100
    # Initialize how many consumers we would like working
    num_consumers = cpu_count() * 2
    # Create the Queue objects
    inqueue = JoinableQueue()
    output = Queue()
    # Start the process for writing the HDF5 file
    proc = Process(target=write_hdf, args=(output, output_filename))
    proc.start()
    print("Loading %i images into memory..." % chunkSize)
    for i in range(0, totsize, chunkSize):
        time0 = time.time()
        chunk = list(ints[i:i + chunkSize])
        data_list = []
        # Make a list of the HDF5 datasets we are reading in
        for files in file_list:
            # Shape is (angles, rows, columns)
            data_list.append(files['exchange']['data'][chunk, ...])
        data_list = np.asarray(data_list)
        print("Elapsed time to load images %i-%i is %0.2f minutes." % (chunk[0], chunk[-1], (time.time() - time0) / 60))
        consumers = []
        # Create consumer processes
        for _ in range(num_consumers):
            p = Process(target=consumer, args=(inqueue, output))
            consumers.append(p)
            p.start()
        for n in range(data_list.shape[1]):
            # Feed data into the queue
            inqueue.put((chunk[n], data_list[:, n, ...]))
        # Kill all of the consumer processes when everything is finished
        for _ in range(num_consumers):
            inqueue.put(None)
        for c in consumers:
            c.join()
        print("Elapsed time to process images %i-%i is %0.2f minutes." % (chunk[0], chunk[-1], (time.time() - time0) / 60))
        time.sleep(1)
    output.put(None)
    proc.join()
    # Close the input HDF5 files.
    for hdf_file in file_list:
        hdf_file.close()
    print("Input HDF5 files closed.")
    return
if __name__ == '__main__':
    start_time = time.time()
    # raw_images_dir, raw_images_basename, and output_dir are set elsewhere in the script
    raw_images_filenames = glob.glob(raw_images_dir + raw_images_basename)
    tempname = os.path.basename(raw_images_filenames[0]).split('.')[0]
    tempname_split = tempname.split('_')[:-1]
    output_filename = output_dir + '_'.join(tempname_split) + '_Prefiltered.hdf5'
    fprocess_hdf_stack(raw_images_filenames, output_filename)
    print("Elapsed time is %0.2f minutes" % ((time.time() - start_time) / 60))
I think my bottleneck is actually not in loading the images; it's in initializing the consumers and doing the processing on the 5 images per frame number. I've been trying to pull the consumer function out of the for loop, but I don't know how to put a cap on memory for that so I don't run out of RAM.
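To show what I mean by pulling the consumers out of the loop, here is a rough sketch of the direction I've been trying: start the writer and a fixed pool of consumers once, and bound memory by giving the input queue a maxsize so that put() blocks whenever the consumers fall behind. It reuses consumer() and write_hdf() from above; the function name fprocess_hdf_stack_bounded and the queue_depth value are just placeholders I made up, and I haven't verified that this is actually faster.

def fprocess_hdf_stack_bounded(hdf_filenames, output_filename, chunkSize=100, queue_depth=50):
    file_list = [h5py.File(fname, 'r') for fname in hdf_filenames]
    data_shape = file_list[0]['exchange']['data'].shape
    totsize = data_shape[0]
    # Create the output dataset up front, as before
    with h5py.File(output_filename, 'w') as fout:
        fout.create_dataset('Prefiltered_images', data_shape, dtype=np.uint16)
    # maxsize bounds the queue: once queue_depth frame stacks are waiting,
    # put() blocks, so RAM stays around one read chunk plus queue_depth
    # stacks of 5 x 1920 x 1200 uint16 images
    inqueue = JoinableQueue(maxsize=queue_depth)
    output = Queue()
    # Start the writer and a fixed pool of consumers once, outside the chunk loop
    proc = Process(target=write_hdf, args=(output, output_filename))
    proc.start()
    consumers = []
    for _ in range(cpu_count()):
        p = Process(target=consumer, args=(inqueue, output))
        consumers.append(p)
        p.start()
    # Read chunks and feed one frame stack at a time; the bounded queue
    # throttles reading whenever the consumers fall behind
    for i in range(0, totsize, chunkSize):
        stop = min(i + chunkSize, totsize)
        data_list = np.asarray([f['exchange']['data'][i:stop, ...] for f in file_list])
        for n in range(data_list.shape[1]):
            inqueue.put((i + n, data_list[:, n, ...]))
    # Shut down the consumers, then the writer, after all frames are queued
    for _ in consumers:
        inqueue.put(None)
    for c in consumers:
        c.join()
    output.put(None)
    proc.join()
    for hdf_file in file_list:
        hdf_file.close()

Is throttling with a bounded queue like this a reasonable way to cap memory, or is there a better pattern for this kind of chunked read/process/write pipeline? Thanks!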