CUDA C ++代码在MATLAB中引发错误

时间:2017-05-26 06:15:11

标签: matlab cuda

我的目标是将一组用MATLAB编写的代码转换为CUDA C ++,以便在GPU上进行并行处理。 这是我试图改造的MATLAB代码:

function [M] = iqm_czekanowski(img1, img2)
img1 = double(img1);
img2 = double(img2);
[R,C,K] = size(img1);
N2 = R*C;
SUM1 = zeros(R,C);
SUM2 = zeros(R,C);
MIN = min(img1,img2);
%display(size(MIN));
SUM = img1 + img2;
for k=1:K
    SUM1 = SUM1 + MIN(:,:,k);
    SUM2 = SUM2 + SUM(:,:,k);    
end
SUM = (2 .* SUM1) ./ SUM2;
SUM(isnan(SUM))=0;
SUM = 1 - (SUM);
M = sum(sum(SUM)) / N2;

img1和img2是从另一个脚本作为参数传递的2个rgb图像。为简化起见,我将rgb图像分成3个不同的通道img_r,img_b和img_g,分别代表r,b和g平面。 这是引发错误的CUDA C ++代码:

__global__ void iqm(int *img_r, int *img_g, int *img_b, int *f_img_r, int *f_img_g, int *f_img_b, int *x, int *y, int *z, double *iqm_res){
int n = x[0] * y[0];

//mae and mse
double mae = 0, m1 = 0, m2 = 0, m3 = 0;
double mse = 0, mse1 = 0, mse2 = 0, mse3 = 0;
for (int i = 0; i < n; ++i){
    m1 = m1 + abs(img_r[i] - f_img_r[i]);
    mse1 = mse1 + pow((double)abs(img_r[i] - f_img_r[i]),2.0);
}
m1 = m1 / n;
mse1 = sqrt(mse1 / n);
for (int i = 0; i < n; ++i){
    m2 = m2 + abs(img_g[i] - f_img_g[i]);
    mse2 = mse2 + pow((double)abs(img_g[i] - f_img_g[i]),2.0);
}
m2 = m2 / n;
mse2 = sqrt(mse2 / n);
for (int i = 0; i < n; ++i){
    m3 = m3 + abs(img_b[i] - f_img_b[i]);
    mse3 = mse3 + pow((double)abs(img_b[i] - f_img_b[i]),2.0);
}
m3 = m3 / n;
mse3 = sqrt(mse3 / n);
mae = (m1 + m2 + m3) / z[0];
mse = (mse1 + mse2 + mse3) / z[0];

//iqm_res[0] = mae;
//iqm_res[1] = mse;

//czekanowski
int min_r[26730], min_g[26730], min_b[26730];
int sum_r[26730], sum_g[26730], sum_b[26730];
for (int i = 0; i < n; ++i){
    if (img_r[i] <= f_img_r[i]){
        min_r[i] = img_r[i];
    }
    else{
        min_r[i] = f_img_r[i];
    }
    sum_r[i] = img_r[i] + f_img_r[i];

    if (img_g[i] <= f_img_g[i]){
        min_g[i] = img_g[i];
    }
    else{
        min_g[i] = f_img_g[i];
    }
    sum_g[i] = img_g[i] + f_img_g[i];

    if (img_b[i] <= f_img_b[i]){
        min_b[i] = img_b[i];
    }
    else{
        min_b[i] = f_img_b[i];
    }
    sum_b[i] = img_b[i] + f_img_b[i];
}

int sum1[26730], sum2[26730];
for (int i = 0; i < n; ++i){
    sum1[i] = min_r[i] + min_g[i] + min_b[i];
    sum2[i] = sum_r[i] + sum_g[i] + sum_b[i];
}
double sum[26730];
for (int i = 0; i < n; ++i){
    if (sum2[i] == 0){
        sum[i] = 1.0;
    }
    else{
        sum[i] = 1 - (2 * sum1[i] / sum2[i]);

    }
}
double czekanowski = 0;
for (int i = 0; i < n; ++i){
    czekanowski += sum[i];
}
czekanowski /= (double)n;
//printf("%f",czekanowski);
iqm_res[0] = mae;
iqm_res[1] = mse;
iqm_res[2] = czekanowski;
}

前三个参数表示第一个图像的r,g,b通道,接下来的三个参数表示第二个图像的相同。代码的最后一行

    iqm_res[2] = czekanowski;

是导致错误的那个。 这是我在评论最后一行后得到的结果

iqm =

1.0595    1.9781    0.0065    0.9972    0.9995    0.2892    3.9219    1.3211

iqm_res =

1.0595    1.9781         0         0         0         0         0         0         0         0

以及我取消注释后出现的错误:

使用parallel.gpu.CUDAKernel / feval时出错 尝试启动内核时发生意外错误。 CUDA错误是: CUDA_ERROR_INVALID_VALUE

iqm_main_demo出错(第59行) [t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] = feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);

mse和mae部分工作正常,并给出了正确的结果。 另外我想问一下,如果可以用于上述图像的大小有任何限制。我拍了1500x1200的大图像,导致硬件错误。

编辑:整个代码包括内核调用。

%iqm_main_demo

clear all;

img_sample=imread('onion.png');
gfilt = fspecial('gaussian');
filt_img = imfilter(img_sample, gfilt, 'replicate');
cnt=0;

iqm(cnt+1) = iqm_mae(img_sample, filt_img);
iqm(cnt+2) = iqm_mse(img_sample, filt_img);
iqm(cnt+3) = iqm_czekanowski(img_sample, filt_img);
iqm(cnt+4) = iqm_crosscorr(img_sample, filt_img);
iqm(cnt+5) = iqm_normcrosscorr(img_sample, filt_img);
iqm(cnt+6) = iqm_mas(img_sample, filt_img);
iqm(cnt+7) = iqm_spectralmagnitude(img_sample, filt_img);
iqm(cnt+8) = iqm_spectralphase(img_sample, filt_img);
%iqm(cnt+9) = iqm_hvs(img_sample, filt_img);
%iqm(cnt+10) = iqm_laplacianmse(img_sample, filt_img);
%cnt = cnt + 10;

iqm

k = parallel.gpu.CUDAKernel('demo.ptx','demo.cu');
k.ThreadBlockSize = [1 1 1];

img1_r = img_sample(:,:,1);
img1_g = img_sample(:,:,2);
img1_b = img_sample(:,:,3);

img2_r = filt_img(:,:,1);
img2_g = filt_img(:,:,2);
img2_b = filt_img(:,:,3);

[x,y,z]=size(img_sample);

img1_r = reshape(img1_r',[1 x*y]);
img1_g = reshape(img1_g',[1 x*y]);
img1_b = reshape(img1_b',[1 x*y]);

img2_r = reshape(img2_r',[1 x*y]);
img2_g = reshape(img2_g',[1 x*y]);
img2_b = reshape(img2_b',[1 x*y]);

img1_r = gpuArray(int32(img1_r));
img1_g = gpuArray(int32(img1_g));
img1_b = gpuArray(int32(img1_b));

img2_r = gpuArray(int32(img2_r));
img2_g = gpuArray(int32(img2_g));
img2_b = gpuArray(int32(img2_b));

x = gpuArray(int32(x));
y = gpuArray(int32(y));
z = gpuArray(int32(z));

iqm_res = gpuArray(zeros(1,10)); 

[t1,t2,t3,t4,t5,t6,t7,t8,t9,iqm_res] = 
feval(k,img1_r,img1_g,img1_b,img2_r,img2_g,img2_b,x,y,z,iqm_res);

iqm_res

2 个答案:

答案 0 :(得分:1)

您出于某种原因指定了1x1x1的块尺寸,并且您说您的输入为1500x1200,需要180万个块。我不确定这是可能的,所以也许这就是原因。尝试评论该行

k.ThreadBlockSize = [1 1 1];

并查看它是否开始工作。

答案 1 :(得分:0)

我弄清楚出了什么问题。原来gpu上的一个线程有限的本地内存来存储它的变量副本(可能大概是64K)。声明所有辅助阵列导致了问题。直接计算所有内容而没有任何辅助值可以解决它。

另外,我一直在以错误的方式使用CUDA。我假设我可以让每个线程完全处理一个图像,从而通过一次处理多个图像来节省连续处理图像的时间。为了在单个图像上测试我的代码,我采用了ThreadBlockSize = [1 1 1]。在我解决了辅助阵列问题之后,由于Windows强制执行2秒的内核执行超时,代码仍然无效。更改注册表中的超时值可修复问题。但是,现在以不同的方式实现了这个问题,不会导致任何内存溢出或超时。