Multiprocessing of large HDF5 files

Time: 2018-06-19 17:00:26

Tags: python-multiprocessing hdf5 large-data

I have 5 HDF5 files, each 22 GB. Each HDF5 file is a series of 4801 images of size 1920 x 1200. I need to load the same frame number from each HDF5 file, get rid of some rogue pixels (zingers), average the stack of 5 images, and write a new HDF5 file with one processed image at each frame number. Because I can't load all 5 HDF5 files at once without running out of RAM, I only load chunks of images from each HDF5 file, put the 5 images for each frame number into a queue, process the stack, and write the resulting image out to the HDF5 file. Right now I'm using h5py to do all of the reading/writing of the HDF5 files.
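
For reference, the per-frame operation described above, run serially for a single frame index, might look like the sketch below. The prefilter_frame helper and its default threshold are hypothetical; the 'exchange/data' layout matches the code further down, and the full multiprocessing version follows in the question.

import h5py
import numpy as np

def prefilter_frame(filenames, frame, zinger_level=500):
    """Load the same frame from each file, suppress zingers, and average."""
    frames = []
    for fname in filenames:
        with h5py.File(fname, 'r') as fin:
            #one image (rows x columns) at the requested frame number
            frames.append(fin['exchange']['data'][frame, ...])
    stack = np.asarray(frames)              #shape (5, rows, columns)
    med = np.median(stack, axis=0)          #per-pixel median across the stack
    mask = stack - med > zinger_level       #pixels far above the median are zingers
    stack[mask] = np.broadcast_to(med, stack.shape)[mask]  #replace with the median
    return np.mean(stack, axis=0, dtype=np.float32).astype(np.uint16)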

I'd like to know what the most computationally efficient way of working with this chunked data is. Right now, I dedicate one process as the writer, then loop over some chunk size of data, for which I create a number of consumers, put the data into a queue, wait for the consumers to finish, then rinse and repeat until all of the images have been processed. This means that new consumer processes are created every time the loop advances, which I imagine adds some overhead. A sample of the code is below.

#!/usr/bin/env python

import time
import os
from multiprocessing import Process, Queue, JoinableQueue, cpu_count
import glob
import h5py
import numpy as np

#threshold above the per-pixel median at which a pixel is treated as a zinger;
#not defined in the posted snippet, so the value here is only a placeholder
zinger_level = 500

'''Function definitions''' 

# The consumer function takes data off of the Queue
def consumer(inqueue,output):    
    # Run indefinitely
    while True:         
        # If the queue is empty, queue.get() will block until the queue has data
        all_data = inqueue.get()
        if all_data:
            #n is the index corresponding to the projection location 
            n, image_data = all_data
            #replace zingers with median and average stack 
            #Find the median for each pixel of the prefiltered image
            med = np.median(image_data,axis=0)
            #Loop through the image set
            for j in range(image_data.shape[0]):
                replicate = image_data[j,...]
                mask = replicate - med > zinger_level
                replicate[mask] = med[mask] # Substitute with median
                image_data[j,...] = replicate # Put data back in place
            out = np.mean(image_data,axis=0,dtype=np.float32).astype(np.uint16)
            output.put((n,out))
        else:
            break

#function for writing out HDF5 file        
def write_hdf(output,output_filename):
    #pull processed frames off the output queue and write each into the HDF5 file
    while True:
        args = output.get()
        if args:
            i,data = args
            with h5py.File(output_filename,'a') as fout:
                fout['Prefiltered_images'][i,...] = data
        else:
            break

def fprocess_hdf_stack(hdf_filenames,output_filename):
    file_list = []
    for fname in hdf_filenames:
        file_list.append(h5py.File(fname,'r'))
    #process chunks of data so that we don't run out of memory
    data_shape = file_list[0]['exchange']['data'].shape
    totsize = data_shape[0]
    #create the output HDF5 file and dataset up front
    with h5py.File(output_filename,'w') as fout:
        fout.create_dataset('Prefiltered_images',data_shape,dtype=np.uint16)
    ints = list(range(totsize))
    chunkSize = 100

    #initialize how many consumers we would like working 
    num_consumers = cpu_count()*2

    #Create the Queue objects
    inqueue = JoinableQueue()
    output = Queue()     

    #start process for writing HDF5 file
    proc = Process(target=write_hdf, args=(output,output_filename))
    proc.start()

    print("Loading %i images into memory..."%chunkSize)
    for i in range(0,totsize,chunkSize):
        time0 = time.time()
        chunk = ints[i:i+chunkSize]
        data_list = []
        #Make a list of the HDF5 datasets we are reading in
        for files in file_list:
            #shape is (angles, rows, columns)
            data_list.append(files['exchange']['data'][chunk,...])
        data_list = np.asarray(data_list)
        print("Elapsed time to load images %i-%i is %0.2f minutes." %(chunk[0],chunk[-1],(time.time() - time0)/60))

        consumers = []       

        #Create consumer processes
        for _ in range(num_consumers):
            p = Process(target=consumer, args=(inqueue,output))
            consumers.append(p)
            p.start()

        for n in range(data_list.shape[1]):
            #Feed data into the queue
            inqueue.put((chunk[n],data_list[:,n,...]))

        #Kill all of the processes when everything is finished    
        for _ in range(num_consumers):
            inqueue.put(None)

        for c in consumers:
            c.join()
        print("Elapsed time to process images %i-%i is %0.2f minutes." %(chunk[0],chunk[-1],(time.time() - time0)/60))

    time.sleep(1)
    output.put(None)
    proc.join()     

    #Close the input HDF5 files.
    for hdf_file in file_list:
        hdf_file.close()
    print("Input HDF5 files closed.") 
    return

if __name__ == '__main__':  
    start_time = time.time()
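    # raw_images_dir, raw_images_basename and output_dir are not defined in
    # this snippet; they are set elsewhere in the original script.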
    raw_images_filenames = glob.glob(raw_images_dir + raw_images_basename)
    tempname = os.path.basename(raw_images_filenames[0]).split('.')[0]
    tempname_split = tempname.split('_')[:-1]
    output_filename = output_dir+'_'.join(tempname_split) + '_Prefiltered.hdf5'
    fprocess_hdf_stack(raw_images_filenames,output_filename)
    print("Elapsed time is %0.2f minutes" %((time.time() - start_time)/60))

I don't think my bottleneck is actually in loading the images. It's in initializing the consumers and running the processing on the 5 images at each frame number. I've been playing around with taking the consumer function out of the for loop, but I don't know how to put a cap on memory use so that I don't run out of RAM. Thanks!
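
One way to take the consumers out of the chunk loop and still cap memory is sketched below. It assumes consumer(), write_hdf(), file_list, chunkSize, totsize and output_filename are set up exactly as in the code above, and that the output dataset has already been created. The key idea is to start the consumers once and give the input queue a maxsize, so that inqueue.put() blocks whenever too many frames are already waiting. The maxsize value is only an example, and the sketch has not been tested against the real data.

#Sketch: long-lived consumers and a bounded queue instead of per-chunk workers
num_consumers = cpu_count()
inqueue = JoinableQueue(maxsize=50)   #put() blocks once 50 frames are queued
output = Queue()

#writer process, same as before
writer = Process(target=write_hdf, args=(output, output_filename))
writer.start()

#start the consumers once, outside the chunk loop
consumers = [Process(target=consumer, args=(inqueue, output))
             for _ in range(num_consumers)]
for c in consumers:
    c.start()

for i in range(0, totsize, chunkSize):
    #read one contiguous chunk from each input file; the bounded queue
    #throttles this loop whenever the consumers fall behind
    data_list = np.asarray([f['exchange']['data'][i:i+chunkSize, ...]
                            for f in file_list])
    for n in range(data_list.shape[1]):
        inqueue.put((i + n, data_list[:, n, ...]))

#shut everything down once every frame has been queued
for _ in range(num_consumers):
    inqueue.put(None)
for c in consumers:
    c.join()
output.put(None)
writer.join()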

0 Answers
