我的目标是将一组用MATLAB编写的代码转换为CUDA C ++,以便在GPU上进行并行处理。 这是我试图改造的MATLAB代码:
function [M] = iqm_czekanowski(img1, img2)
img1 = double(img1);
img2 = double(img2);
[R,C,K] = size(img1);
N2 = R*C;
SUM1 = zeros(R,C);
SUM2 = zeros(R,C);
MIN = min(img1,img2);
%display(size(MIN));
SUM = img1 + img2;
for k=1:K
SUM1 = SUM1 + MIN(:,:,k);
SUM2 = SUM2 + SUM(:,:,k);
end
SUM = (2 .* SUM1) ./ SUM2;
SUM(isnan(SUM))=0;
SUM = 1 - (SUM);
M = sum(sum(SUM)) / N2;
img1和img2是从另一个脚本作为参数传递的2个rgb图像。为简化起见,我将rgb图像分成3个不同的通道img_r,img_b和img_g,分别代表r,b和g平面。 这是引发错误的CUDA C ++代码:
__global__ void iqm(int *img_r, int *img_g, int *img_b, int *f_img_r, int *f_img_g, int *f_img_b, int *x, int *y, int *z, double *iqm_res){
int n = x[0] * y[0];
//mae and mse
double mae = 0, m1 = 0, m2 = 0, m3 = 0;
double mse = 0, mse1 = 0, mse2 = 0, mse3 = 0;
for (int i = 0; i < n; ++i){
m1 = m1 + abs(img_r[i] - f_img_r[i]);
mse1 = mse1 + pow((double)abs(img_r[i] - f_img_r[i]),2.0);
}
m1 = m1 / n;
mse1 = sqrt(mse1 / n);
for (int i = 0; i < n; ++i){
m2 = m2 + abs(img_g[i] - f_img_g[i]);
mse2 = mse2 + pow((double)abs(img_g[i] - f_img_g[i]),2.0);
}
m2 = m2 / n;
mse2 = sqrt(mse2 / n);
for (int i = 0; i < n; ++i){
m3 = m3 + abs(img_b[i] - f_img_b[i]);
mse3 = mse3 + pow((double)abs(img_b[i] - f_img_b[i]),2.0);
}
m3 = m3 / n;
mse3 = sqrt(mse3 / n);
mae = (m1 + m2 + m3) / z[0];
mse = (mse1 + mse2 + mse3) / z[0];
//iqm_res[0] = mae;
//iqm_res[1] = mse;
//czekanowski
int min_r[26730], min_g[26730], min_b[26730];
int sum_r[26730], sum_g[26730], sum_b[26730];
for (int i = 0; i < n; ++i){
if (img_r[i] <= f_img_r[i]){
min_r[i] = img_r[i];
}
else{
min_r[i] = f_img_r[i];
}
sum_r[i] = img_r[i] + f_img_r[i];
if (img_g[i] <= f_img_g[i]){
min_g[i] = img_g[i];
}
else{
min_g[i] = f_img_g[i];
}
sum_g[i] = img_g[i] + f_img_g[i];
if (img_b[i] <= f_img_b[i]){
min_b[i] = img_b[i];
}
else{
min_b[i] = f_img_b[i];
}
sum_b[i] = img_b[i] + f_img_b[i];
}
int sum1[26730], sum2[26730];
for (int i = 0; i < n; ++i){
sum1[i] = min_r[i] + min_g[i] + min_b[i];
sum2[i] = sum_r[i] + sum_g[i] + sum_b[i];
}
double sum[26730];
for (int i = 0; i < n; ++i){
if (sum2[i] == 0){
sum[i] = 1.0;
}
else{
sum[i] = 1 - (2 * sum1[i] / sum2[i]);
}
}
double czekanowski = 0;
for (int i = 0; i < n; ++i){
czekanowski += sum[i];
}
czekanowski /= (double)n;
//printf("%f",czekanowski);
iqm_res[0] = mae;
iqm_res[1] = mse;
iqm_res[2] = czekanowski;
}
前三个参数表示第一个图像的r,g,b通道,接下来的三个参数表示第二个图像的相同。代码的最后一行
iqm_res[2] = czekanowski;
是导致错误的那个。 这是我在评论最后一行后得到的结果
iqm =
1.0595 1.9781 0.0065 0.9972 0.9995 0.2892 3.9219 1.3211
iqm_res =
1.0595 1.9781 0 0 0 0 0 0 0 0
以及我取消注释后出现的错误:
使用parallel.gpu.CUDAKernel / feval时出错 尝试启动内核时发生意外错误。 CUDA错误是: CUDA_ERROR_INVALID_VALUE
iqm_main_demo出错(第59行) [t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] = feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);
mse和mae部分工作正常,并给出了正确的结果。 另外我想问一下,如果可以用于上述图像的大小有任何限制。我拍了1500x1200的大图像,导致硬件错误。
编辑:整个代码包括内核调用。
%iqm_main_demo
clear all;
img_sample=imread('onion.png');
gfilt = fspecial('gaussian');
filt_img = imfilter(img_sample, gfilt, 'replicate');
cnt=0;
iqm(cnt+1) = iqm_mae(img_sample, filt_img);
iqm(cnt+2) = iqm_mse(img_sample, filt_img);
iqm(cnt+3) = iqm_czekanowski(img_sample, filt_img);
iqm(cnt+4) = iqm_crosscorr(img_sample, filt_img);
iqm(cnt+5) = iqm_normcrosscorr(img_sample, filt_img);
iqm(cnt+6) = iqm_mas(img_sample, filt_img);
iqm(cnt+7) = iqm_spectralmagnitude(img_sample, filt_img);
iqm(cnt+8) = iqm_spectralphase(img_sample, filt_img);
%iqm(cnt+9) = iqm_hvs(img_sample, filt_img);
%iqm(cnt+10) = iqm_laplacianmse(img_sample, filt_img);
%cnt = cnt + 10;
iqm
k = parallel.gpu.CUDAKernel('demo.ptx','demo.cu');
k.ThreadBlockSize = [1 1 1];
img1_r = img_sample(:,:,1);
img1_g = img_sample(:,:,2);
img1_b = img_sample(:,:,3);
img2_r = filt_img(:,:,1);
img2_g = filt_img(:,:,2);
img2_b = filt_img(:,:,3);
[x,y,z]=size(img_sample);
img1_r = reshape(img1_r',[1 x*y]);
img1_g = reshape(img1_g',[1 x*y]);
img1_b = reshape(img1_b',[1 x*y]);
img2_r = reshape(img2_r',[1 x*y]);
img2_g = reshape(img2_g',[1 x*y]);
img2_b = reshape(img2_b',[1 x*y]);
img1_r = gpuArray(int32(img1_r));
img1_g = gpuArray(int32(img1_g));
img1_b = gpuArray(int32(img1_b));
img2_r = gpuArray(int32(img2_r));
img2_g = gpuArray(int32(img2_g));
img2_b = gpuArray(int32(img2_b));
x = gpuArray(int32(x));
y = gpuArray(int32(y));
z = gpuArray(int32(z));
iqm_res = gpuArray(zeros(1,10));
[t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] =
feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);
iqm_res
答案 0 :(得分:1)
您出于某种原因指定了1x1x1的块尺寸,并且您说您的输入为1500x1200,需要180万个块。我不确定这是可能的,所以也许这就是原因。尝试评论该行
k.ThreadBlockSize = [1 1 1];
并查看它是否开始工作。
答案 1 :(得分:0)
我弄清楚出了什么问题。原来gpu上的一个线程有限的本地内存来存储它的变量副本(可能大概是64K)。声明所有辅助阵列导致了问题。直接计算所有内容而没有任何辅助值可以解决它。
另外,我一直在以错误的方式使用CUDA。我假设我可以让每个线程完全处理一个图像,从而通过一次处理多个图像来节省连续处理图像的时间。为了在单个图像上测试我的代码,我采用了ThreadBlockSize = [1 1 1]。在我解决了辅助阵列问题之后,由于Windows强制执行2秒的内核执行超时,代码仍然无效。更改注册表中的超时值可修复问题。但是,现在以不同的方式实现了这个问题,不会导致任何内存溢出或超时。