在图像上并行应用(高斯)滤波器 - MATLAB

时间:2015-03-25 21:37:16

标签: matlab image-processing parallel-processing

我创建了以下(MATLAB)函数,以在图像上应用高斯滤镜模糊:

function [ mBlurredImage ] = ApplyGaussianBlur( mInputImage, gaussianKernelStd, stdToRadiusFactor )
% ApplyGaussianBlur  Blur a 2D image with a separable, normalized Gaussian kernel.
%
% Input:
%   mInputImage       - Image matrix to blur.
%   gaussianKernelStd - Standard deviation of the Gaussian, in pixels.
%   stdToRadiusFactor - Kernel radius in units of the standard deviation;
%                       the radius is ceil(stdToRadiusFactor * gaussianKernelStd).
% Output:
%   mBlurredImage     - Blurred image with the same number of rows and
%                       columns as mInputImage.

% Half-width of the kernel support (the kernel has 2 * radius + 1 taps).
kernelRadius = ceil(stdToRadiusFactor * gaussianKernelStd); % Imitating Photoshop - See Reference

% Sample the Gaussian on the integer grid and normalize it to unit sum.
vSampleGrid = -kernelRadius:kernelRadius;
vKernel     = exp(-(vSampleGrid .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
vKernel     = vKernel / sum(vKernel);

% Replicate the border by the kernel radius so that the 'valid' part of the
% convolution has exactly the size of the input image.
mPaddedImage = padarray(mInputImage, [kernelRadius, kernelRadius], 'replicate', 'both');

% Separable convolution: the same 1D kernel is applied along both dimensions.
mBlurredImage = conv2(vKernel, vKernel.', mPaddedImage, 'valid');


end

我正在尝试找出为它创建并行版本的最佳方法。我希望找到一种同样适用于OpenMP的方法/策略。

我尝试先填充图像,然后将其分成4个部分,并在每个部分上分别应用模糊,最后再把所有分块重新拼接起来。这是代码:

function [ mBlurredImage ] = ApplyGaussianBlurParallel( mInputImage, gaussianKernelStd, stdToRadiusFactor, numThreads )
% ApplyGaussianBlurParallel  Gaussian blur computed on column blocks inside a parfor loop.
%
% The image is padded by border replication and cut into disjoint vertical
% strips. Each strip is extended by the kernel radius on both sides so that
% a 'valid' convolution returns exactly the strip's own columns. The
% processed strips are then concatenated back into the full image.
%
% Input:
%   mInputImage       - 2D image matrix to blur.
%   gaussianKernelStd - Standard deviation of the Gaussian, in pixels.
%   stdToRadiusFactor - Kernel radius in units of the standard deviation;
%                       the radius is ceil(stdToRadiusFactor * gaussianKernelStd).
%   numThreads        - Requested number of column blocks (one parfor
%                       iteration per block). It is clamped to the range
%                       [1, number of image columns].
% Output:
%   mBlurredImage     - Blurred image with the same number of rows and
%                       columns as mInputImage. Its class is whatever conv2
%                       returns for the blocks (the output is no longer
%                       forced to double by a zeros() preallocation).

numCols = size(mInputImage, 2);

gaussianKernelRadius = ceil(stdToRadiusFactor * gaussianKernelStd); % Imitating Photoshop - See Reference

vGaussianKernel = exp(-((-gaussianKernelRadius:gaussianKernelRadius) .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
vGaussianKernel = vGaussianKernel / sum(vGaussianKernel);

mInputImagePadded = padarray(mInputImage, [gaussianKernelRadius, gaussianKernelRadius], 'replicate', 'both');

% Never create more blocks than there are columns, and always at least one.
numBlocks = max(1, min(floor(numThreads), numCols));

% Disjoint column ranges in image coordinates. Block k covers the columns
% (vBlockEdges(k) + 1):vBlockEdges(k + 1), so no column is convolved twice.
vBlockEdges             = round(linspace(0, numCols, numBlocks + 1));
vFirstColIdxImageBlock  = vBlockEdges(1:numBlocks) + 1;
vLastColIdxImageBlock   = vBlockEdges(2:(numBlocks + 1));

% Image column c is located at padded column (c + radius). A block covering
% image columns [first, last] plus `radius` extra columns on each side
% therefore spans padded columns [first, last + 2 * radius].
vFirstColIdxImagePaddedBlock    = vFirstColIdxImageBlock;
vLastColIdxImagePaddedBlock     = vLastColIdxImageBlock + (2 * gaussianKernelRadius);

cImageBlock             = cell(numBlocks, 1);
cImageBlockProcessed    = cell(numBlocks, 1);

% Extract the strips up front so the parfor loop below indexes the cell
% array only by its loop variable (each iteration needs just its own strip).
for iBlockIdx = 1:numBlocks
    vColsIdxImagePadded = vFirstColIdxImagePaddedBlock(iBlockIdx):vLastColIdxImagePaddedBlock(iBlockIdx);

    cImageBlock{iBlockIdx} = mInputImagePadded(:, vColsIdxImagePadded);
end

parfor iBlockIdx = 1:numBlocks
    cImageBlockProcessed{iBlockIdx} = conv2(vGaussianKernel, vGaussianKernel.', cImageBlock{iBlockIdx}, 'valid');
end

% The blocks are disjoint and ordered left to right, so horizontal
% concatenation rebuilds the image and keeps the class returned by conv2.
mBlurredImage = [cImageBlockProcessed{:}];


end

我还创建了以下脚本来分析性能:

% Run time comparison: `ApplyGaussianBlurParallel` vs. `ApplyGaussianBlur`
clear();

% Benchmark configuration
gaussianKernelStd   = 10;
stdToRadiusFactor   = 3.5;
numThreads          = 4;

numIterations   = 20;
vInputImageSize = [720, 1280, 1920, 2560]; % Side length of the square test images

% One row per iteration, one column per image size
numImageSizes                   = length(vInputImageSize);
vRunTimeParallelGaussianBlur    = zeros(numIterations, numImageSizes);
vRunTimeSerialGaussianBlur      = zeros(numIterations, numImageSizes);

for iImageSizeIdx = 1:numImageSizes
    imageSize   = vInputImageSize(iImageSizeIdx);
    mInputImage = randn(imageSize, 'single');

    % Serial reference, timed with a single computational thread
    maxNumCompThreads(1);
    for iIter = 1:numIterations
        hTimeStart      = tic();
        mBlurredImage1  = ApplyGaussianBlur(mInputImage, gaussianKernelStd, stdToRadiusFactor);
        vRunTimeSerialGaussianBlur(iIter, iImageSizeIdx) = toc(hTimeStart);
    end

    % Block based version, timed with `numThreads` computational threads
    maxNumCompThreads(numThreads);
    for iIter = 1:numIterations
        hTimeStart      = tic();
        mBlurredImage1  = ApplyGaussianBlurParallel(mInputImage, gaussianKernelStd, stdToRadiusFactor, numThreads);
        vRunTimeParallelGaussianBlur(iIter, iImageSizeIdx) = toc(hTimeStart);
    end
end

% Per image size statistics (taken over the iterations)
vRunTimeParallelGaussianBlurMean    = mean(vRunTimeParallelGaussianBlur);
vRunTimeParallelGaussianBlurStd     = std(vRunTimeParallelGaussianBlur);
vRunTimeParallelGaussianBlurMedian  = median(vRunTimeParallelGaussianBlur);

vRunTimeSerialGaussianBlurMean      = mean(vRunTimeSerialGaussianBlur);
vRunTimeSerialGaussianBlurStd       = std(vRunTimeSerialGaussianBlur);
vRunTimeSerialGaussianBlurMedian    = median(vRunTimeSerialGaussianBlur);

% Mean run time versus image size
figure();
plot(vInputImageSize, [vRunTimeParallelGaussianBlurMean(:), vRunTimeSerialGaussianBlurMean(:)], ...
    'LineStyle', 'none', 'Marker', 'o');
title('Mean Runtime');
legend({'Parallel', 'Serial'});

% Median run time versus image size
figure();
plot(vInputImageSize, [vRunTimeParallelGaussianBlurMedian(:), vRunTimeSerialGaussianBlurMedian(:)], ...
    'LineStyle', 'none', 'Marker', 'o');
title('Median Runtime');
legend({'Parallel', 'Serial'});

但我得到的是:

Mean of Run Time Median of Run Time

也就是说,我没能让并行版本足够高效。有人能想到更好、更高效的实现方法吗?

谢谢。

1 个答案:

答案 0 :(得分:2)

在某个环节上,您把MATLAB进程内部使用的线程数与并行计算工具箱所使用的工作进程(worker)数量混为一谈了。

maxNumCompThreads设置允许每个matlab进程使用的线程数。这与并行计算工具箱无关。

parpool / matlabpool设置的是工作进程(worker,即各自独立的进程)的数量,这些工作进程负责处理由并行计算工具箱函数(如parfor)生成的作业。

ApplyGaussianBlurParallel需要的是工作进程(worker)的数量,而不是您当前传入的线程数。

修正这一点之后,我得到的结果略有改善,但并行版本仍然比串行版本慢。我完全删除了maxNumCompThreads,因为看不出在这里使用它的理由。

最有效的方法可能就是使用你的GPU:

function [ mBlurredImage ] = ApplyGaussianBlur( mInputImage, gaussianKernelStd, stdToRadiusFactor )
% ApplyGaussianBlur  Gaussian blur with the convolution executed on the GPU.
%
% Input:
%   mInputImage       - 2D image matrix to blur (host array).
%   gaussianKernelStd - Standard deviation of the Gaussian, in pixels.
%   stdToRadiusFactor - Kernel radius in units of the standard deviation;
%                       the radius is ceil(stdToRadiusFactor * gaussianKernelStd).
% Output:
%   mBlurredImage     - Blurred image with the same number of rows and
%                       columns as mInputImage. The result of conv2 on
%                       gpuArray inputs is returned as is, i.e. it is not
%                       copied back to host memory here; call
%                       gather(mBlurredImage) if a regular array is needed.
%
% NOTE(review): when benchmarking this function with tic / toc, GPU work may
% still be in flight when the function returns - consider wait(gpuDevice),
% gather() or gputimeit() to get comparable timings. TODO confirm on the
% target MATLAB release.

gaussianBlurRadius  = ceil(stdToRadiusFactor * gaussianKernelStd); % Imitating Photoshop - See Reference

% Normalized 1D Gaussian kernel sampled on the integer grid.
vGaussianKernel = exp(-((-gaussianBlurRadius:gaussianBlurRadius) .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
vGaussianKernel = vGaussianKernel / sum(vGaussianKernel);

% Padding is done on the host; the padded image and the kernel are then
% transferred to the GPU.
mInputImagePadded   = padarray(mInputImage, [gaussianBlurRadius, gaussianBlurRadius], 'replicate', 'both');
GvGaussianKernel    = gpuArray(vGaussianKernel);
GmInputImagePadded  = gpuArray(mInputImagePadded);

% Separable convolution on the GPU; 'valid' removes the padded border.
mBlurredImage = conv2(GvGaussianKernel, GvGaussianKernel.', GmInputImagePadded, 'valid');


end

相同的基准测试(Core i5-4690 4x 3500MHz / GT730): enter image description here