使用CUDA C的二维矩阵的积分图像或总面积表

时间:2014-03-06 07:14:19

标签: c image-processing cuda gpgpu gpu-programming

我正在尝试计算行数和列数不相等的2D矩阵的 Summed Area Table 。我遇到了一个小问题,我的代码似乎在行和列相等的情况下运行正常,但是当行和列不相等时,它无法计算最终输出的最后一行。问题是我无法弄清楚为什么会这样。

积分图像/求和面积表的基本算法:

基本上,在积分和中,每个像素或索引元素计算其上方和后方的所有矩阵元素的总和。例如,对于具有以下元素的3x2输入数组:

 [5, 2|
 |5, 2|  
 |5, 2] 

输出数组中的积分和将为:

 [5,   7|
 |10, 14|  
 |15, 21] 

基本上以下是我在CUDA C中尝试做的事情:

for(int matrixElement_y_index=0; matrixElement_y_index<=total_rows-1; matrixElement_y_index++)
{
    //matrixElement_x_index and matrixElement_y_index represent (x,y) indices of each matrix element
    for(int matrixElement_x_index=0; matrixElement_x_index<=total_columns-1; matrixElement_x_index++)
    {
        int temp=0; 

        for(int r=0;r<=(matrixElement_y_index);r++)
        {
            for(int c=0; c<=matrixElement_x_index;c++)
            {
                temp=temp+input[c][r];
            }
        }

        output[matrixElement_y_index][matrixElement_x_index]=temp;
    }
}

我到目前为止提出的CUDA C代码如下:

#include <iostream>
#include <cuda_runtime.h>

using namespace std;

__global__ void image_integral(int *a, int*b, int width_x,int width_y)
{
    // Thread Ids equal to block Ids because the each blocks contains one thread only.
    int gidx = blockIdx.x;
    int gidy = blockIdx.y;
    int temp=0;

    if(gidx>=width_x || gidy>=width_y)
    {
    //Return the threads which exceed the input array's X or Y dimension.
        return;
    }

    else
    //Compute the Integral Image or Summed Area Table
    {   
        // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.  
        for(int counter=0;counter<=gidy;counter++)
        {
            // The first loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array  
            for(int counter_two=0; counter_two<=gidx; counter_two++)
            {
                temp = temp+a[counter*width_x+counter_two];
            }
        }
    }

    //Transfer the final result to the output array
    b[gidy*width_x+gidx]=temp;
}

void main()
{
    //M is number of rows
    //N is number of columns

    int M=3,N=2, m_e=0;
    int total_e=M*N;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";

    for(int r=0;r<=M-1;r++)
    {
        for(int c=0; c<=N-1;c++)
        {
            cout<<"Enter Matrix element [ "<<c<<","<<r<<"]";
            cin>>m_e;
            matrix_a[r*M+c]=m_e;
            matrix_b[r*M+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"Input:"<<endl;

    for(int kk=0;kk<=M-1;kk++)
    {
        for(int jj=0;jj<=N-1;jj++){
            cout<<matrix_a[kk*M+jj]<<" ";}
        cout<<endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    //Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.

    //Each block contains only one thread.

    dim3 grid(M,N); 

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"The Summed Area table is: "<<endl;

    for(int kk=0;kk<=M-1;kk++)
    {
        for(int jj=0;jj<=N-1;jj++)
            cout<<matrix_b[kk*M+jj]<<" ";
        cout<<endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}

非常感谢!!

1 个答案:

答案 0 :(得分:5)

您的主要问题是内存使用和存储错误。使用你的代码你也破坏了堆! 我通过使用行主要排序来改变你的代码,因为它通常用在c / c ++中。

将输入写入主机内存matrix_a[r*M+c]时,会出现第一个错误。由于r范围来自0..M(3)且c范围来自0..N(2),因此最大索引为2*3+1=7。但你的矩阵只有6个元素 - 最大指数是5!因此,我改变了所有矩阵访问。

通过这些更改,我也必须适合您的网格设置。现在是dim3 grid(N,M);

如果您不确定变量代表什么或如何使用它,请使用良好的代表名称,就像在c参考代码中所做的那样!

随着这一点的改变你的代码对我有用。请注意,矩阵的输入方式也已改变了!

更改完整代码之上: 内核函数:

__global__ void image_integral(int *a, int*b, int rowsTotal,int colsTotal)
{
    // Thread Ids equal to block Ids because the each blocks contains one thread only.
    int col = blockIdx.x;
    int row = blockIdx.y;
    int temp=0;

    if(col < colsTotal && row < rowsTotal)
    {
        // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.  
        for(int r=0;r<=row;r++)
        {
            // The second loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array  
            for(int c=0; c<=col; c++)
            {
                temp = temp+a[r*colsTotal+c];
            }
        }
    }

    //Transfer the final result to the output array
    b[row*colsTotal+col]=temp;
}

主持人实施:

void main()
{
    //M is number of rows
    //N is number of columns

    int M=3,N=2, m_e=0;
    int total_e=M*N;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout<<"Enter Matrix element [ "<<r<<","<<c<<"]";
            cin>>m_e;
            matrix_a[r*N+c]=m_e;
            matrix_b[r*N+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"Input:"<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_a[r*N+c]<<" ";
        }
        cout << endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    //Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.

    //Each block contains only one thread.

    dim3 grid(N,M);

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"The Summed Area table is: "<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_b[r*N+c]<<" ";
        }
        cout << endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}