我创建了以下(MATLAB)函数,以在图像上应用高斯滤镜模糊:
function [ mBlurredImage ] = ApplyGaussianBlur( mInputImage, gaussianKernelStd, stdToRadiusFactor )
% APPLYGAUSSIANBLUR Blur an image matrix with a separable Gaussian kernel.
%
%   mInputImage       - image matrix to blur.
%   gaussianKernelStd - standard deviation of the Gaussian, in samples.
%   stdToRadiusFactor - kernel radius expressed as a multiple of the std;
%                       the radius is rounded up to a whole sample.
%   mBlurredImage     - blurred result, same size as mInputImage.

    % Half-width of the kernel support (Imitating Photoshop - See Reference).
    kernelRadius = ceil(stdToRadiusFactor * gaussianKernelStd);

    % Sample the 1-D Gaussian on the integer grid and scale it to unit sum.
    vSupport = -kernelRadius:kernelRadius;
    vKernel  = exp(-(vSupport .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
    vKernel  = vKernel / sum(vKernel);

    % Replicate the border by one radius on every side so that the 'valid'
    % convolution below yields an output of the input size.
    vPadSize = [kernelRadius, kernelRadius];
    mPadded  = padarray(mInputImage, vPadSize, 'replicate', 'both');

    % Separable filtering: the same 1-D kernel along columns, then rows.
    mBlurredImage = conv2(vKernel, vKernel.', mPadded, 'valid');
end
我正在尝试找出为它编写并行版本的最佳方法。我希望找到一种同样适用于 OpenMP 的方法/策略。
我尝试先填充图像,然后将其分成 4 个部分,并对每个部分分别应用模糊,最后把所有分块重新拼接起来。代码如下:
function [ mBlurredImage ] = ApplyGaussianBlurParallel( mInputImage, gaussianKernelStd, stdToRadiusFactor, numThreads )
% APPLYGAUSSIANBLURPARALLEL Gaussian blur computed on column blocks with parfor.
%
%   The padded image is cut into vertical strips, each strip is blurred
%   independently inside a parfor loop, and the strips are concatenated.
%
%   mInputImage       - image matrix to blur.
%   gaussianKernelStd - standard deviation of the Gaussian, in samples.
%   stdToRadiusFactor - kernel radius expressed as a multiple of the std;
%                       the radius is rounded up to a whole sample.
%   numThreads        - number of column blocks to create. It is clamped to
%                       the range [1, number of image columns]. It does not
%                       by itself set how many workers execute the parfor
%                       loop.
%   mBlurredImage     - blurred result, same size as mInputImage and of the
%                       class returned by conv2 (so it matches the serial
%                       ApplyGaussianBlur output instead of always being
%                       double).

    numCols = size(mInputImage, 2);

    gaussianKernelRadius = ceil(stdToRadiusFactor * gaussianKernelStd); % Imitating Photoshop - See Reference

    % 1-D Gaussian sampled on the integer grid, scaled to unit sum.
    vGaussianKernel = exp(-((-gaussianKernelRadius:gaussianKernelRadius) .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
    vGaussianKernel = vGaussianKernel / sum(vGaussianKernel);

    mInputImagePadded = padarray(mInputImage, [gaussianKernelRadius, gaussianKernelRadius], 'replicate', 'both');

    % Never create more blocks than there are columns, so every block holds
    % at least one output column.
    numBlocks = max(1, min(floor(numThreads), numCols));

    % Disjoint partition of the image columns: block k covers
    % (edge(k) + 1):edge(k + 1), so no column is blurred twice.
    vBlockEdges            = round(linspace(0, numCols, numBlocks + 1));
    vFirstColIdxImageBlock = vBlockEdges(1:numBlocks) + 1;
    vLastColIdxImageBlock  = vBlockEdges(2:(numBlocks + 1));

    % Cut the strips up front so that parfor only receives its own strip.
    cImageBlock = cell(1, numBlocks);
    for iBlockIdx = 1:numBlocks
        % Image column c is padded column (c + radius). Adding a halo of
        % one radius on each side gives padded columns
        % first:(last + 2 * radius).
        firstColIdxImagePaddedBlock = vFirstColIdxImageBlock(iBlockIdx);
        lastColIdxImagePaddedBlock  = vLastColIdxImageBlock(iBlockIdx) + (2 * gaussianKernelRadius);
        cImageBlock{iBlockIdx} = mInputImagePadded(:, firstColIdxImagePaddedBlock:lastColIdxImagePaddedBlock);
    end

    cImageBlockProcessed = cell(1, numBlocks);
    parfor iBlockIdx = 1:numBlocks
        % 'valid' strips the halo again: each result has exactly the
        % columns of its block and all image rows.
        cImageBlockProcessed{iBlockIdx} = conv2(vGaussianKernel, vGaussianKernel.', cImageBlock{iBlockIdx}, 'valid');
    end

    % Blocks are disjoint and ordered, so horizontal concatenation rebuilds
    % the image and keeps the class produced by conv2.
    mBlurredImage = [cImageBlockProcessed{:}];
end
我还创建了以下脚本来分析性能:
% `ApplyGaussianBlurParallel` Test Case
% Times ApplyGaussianBlur and ApplyGaussianBlurParallel on square random
% images of several sizes and plots the mean and median run time per size.
clear();

% Benchmark configuration
gaussianKernelStd = 10;
stdToRadiusFactor = 3.5;
numThreads        = 4;
numIterations     = 20;
vInputImageSize   = [720, 1280, 1920, 2560];

% One row per iteration, one column per image size
vRunTimeParallelGaussianBlur = zeros(numIterations, numel(vInputImageSize));
vRunTimeSerialGaussianBlur   = zeros(numIterations, numel(vInputImageSize));

for iImageSizeIdx = 1:numel(vInputImageSize)
    imageSize   = vInputImageSize(iImageSizeIdx);
    mInputImage = randn(imageSize, 'single');

    % Serial reference, with the MATLAB process limited to one
    % computational thread
    maxNumCompThreads(1);
    for iIter = 1:numIterations
        hTimeStart = tic();
        mBlurredImage1 = ApplyGaussianBlur(mInputImage, gaussianKernelStd, stdToRadiusFactor);
        vRunTimeSerialGaussianBlur(iIter, iImageSizeIdx) = toc(hTimeStart);
    end

    % Block-wise version, with the thread limit raised to numThreads
    maxNumCompThreads(numThreads);
    for iIter = 1:numIterations
        hTimeStart = tic();
        mBlurredImage1 = ApplyGaussianBlurParallel(mInputImage, gaussianKernelStd, stdToRadiusFactor, numThreads);
        vRunTimeParallelGaussianBlur(iIter, iImageSizeIdx) = toc(hTimeStart);
    end
end

% Per-size statistics, taken over the iterations (rows)
vRunTimeSerialGaussianBlurMean     = mean(vRunTimeSerialGaussianBlur, 1);
vRunTimeSerialGaussianBlurStd      = std(vRunTimeSerialGaussianBlur, 0, 1);
vRunTimeSerialGaussianBlurMedian   = median(vRunTimeSerialGaussianBlur, 1);
vRunTimeParallelGaussianBlurMean   = mean(vRunTimeParallelGaussianBlur, 1);
vRunTimeParallelGaussianBlurStd    = std(vRunTimeParallelGaussianBlur, 0, 1);
vRunTimeParallelGaussianBlurMedian = median(vRunTimeParallelGaussianBlur, 1);

% Mean run time: first series is the parallel version, second the serial one
figure();
plot(vInputImageSize, [vRunTimeParallelGaussianBlurMean.', vRunTimeSerialGaussianBlurMean.'], ...
    'LineStyle', 'none', 'Marker', 'o');
title('Mean Runtime');
legend({'Parallel', 'Serial'});

% Median run time, same series order
figure();
plot(vInputImageSize, [vRunTimeParallelGaussianBlurMedian.', vRunTimeSerialGaussianBlurMedian.'], ...
    'LineStyle', 'none', 'Marker', 'o');
title('Median Runtime');
legend({'Parallel', 'Serial'});
但我得到的是:
也就是说,我没能让并行版本足够高效。有没有人能想到更好、更高效的方法,或者能改进我的实现?
谢谢。
答案 0 :(得分:2)
您在某处把 MATLAB 进程内部的线程数与并行计算工具箱(Parallel Computing Toolbox)所使用的工作进程(worker)数量混淆了。
`maxNumCompThreads` 设置每个 MATLAB 进程允许使用的线程数。这与并行计算工具箱无关。
`parpool`(或 `matlabpool`)设置工作进程(每个 worker 是一个独立的进程)的数量,这些进程负责处理由并行计算工具箱函数(如 `parfor`)生成的任务。
`ApplyGaussianBlurParallel` 需要的是工作进程的数量,而不是您当前传入的线程数。
修正这一点之后,我得到的结果略有改善,但并行计算仍然更慢。我完全删除了 `maxNumCompThreads`,因为我看不出在这里使用它的理由。
最有效的方法可能就是使用你的GPU:
function [ mBlurredImage ] = ApplyGaussianBlur( mInputImage, gaussianKernelStd, stdToRadiusFactor )
% APPLYGAUSSIANBLUR Blur an image matrix with a separable Gaussian kernel on the GPU.
%
%   The kernel and the padded image are built on the host, transferred with
%   gpuArray, and convolved there.
%
%   mInputImage       - image matrix to blur.
%   gaussianKernelStd - standard deviation of the Gaussian, in samples.
%   stdToRadiusFactor - kernel radius expressed as a multiple of the std;
%                       the radius is rounded up to a whole sample.
%   mBlurredImage     - blurred result, same size as mInputImage. It is
%                       returned as a gpuArray; call gather(mBlurredImage)
%                       to obtain an ordinary host array.

    gaussianBlurRadius = ceil(stdToRadiusFactor * gaussianKernelStd); % Imitating Photoshop - See Reference

    % 1-D Gaussian sampled on the integer grid, scaled to unit sum.
    vGaussianKernel = exp(-((-gaussianBlurRadius:gaussianBlurRadius) .^ 2) / (2 * gaussianKernelStd * gaussianKernelStd));
    vGaussianKernel = vGaussianKernel / sum(vGaussianKernel);

    % Replicate the border by one radius on every side so that the 'valid'
    % convolution below yields an output of the input size.
    mInputImagePadded = padarray(mInputImage, [gaussianBlurRadius, gaussianBlurRadius], 'replicate', 'both');

    % Move the operands to the GPU; conv2 on gpuArray inputs runs there.
    GvGaussianKernel   = gpuArray(vGaussianKernel);
    GmInputImagePadded = gpuArray(mInputImagePadded);

    % Separable filtering: the same 1-D kernel along columns, then rows.
    mBlurredImage = conv2(GvGaussianKernel, GvGaussianKernel.', GmInputImagePadded, 'valid');
end
相同的基准测试(Core i5-4690 4x 3500MHz / GT730):