使用CUDA的平均过滤器

时间:2016-02-13 11:14:27

标签: cuda

我打算使用CUDA构建图像处理。为了表示图像,我使用矩阵(随机生成值)。我想对这个矩阵应用平均滤波器。我使用的过滤器大小是3.这是我写的代码。当数字(N = 10)小于块尺寸大小(BLOCK_DIM = 32)时,这可以正常工作。我试过N = 5和BLOCK_DIM = 3.它工作正常。

为什么当BLOCK_DIM增加时,此代码会导致意外结果(0而不是平均值),我该如何解决?

#include <stdio.h>
#include <stdlib.h>

#define N 10
#define BLOCK_DIM 32

__global__ void averageKernel (int *a, int *c) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    int index = col + row * N;

    c[index] = 1;
    int sum = 0;
    int avg = 0;
    if (row > 0 && col > 0 && col < N-1 && row < N-1 ) {  
        sum = sum + a[index - 1];
        sum = sum + a[index + 1];
        sum = sum + a[index - N-1];                
        sum = sum + a[index - N];                  
        sum = sum + a[index - N+1];                
        sum = sum + a[index + N-1];                
        sum = sum + a[index + N];                  
        sum = sum + a[index + N+1];                
        sum = sum + a[index];                      
        avg = sum/9;                            
    }
        c[index] = avg;

}

void printMatrix(int a[N][N] )
{
    for(int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            printf("%d\t", a[i][j] );
        }
        printf("\n");
    }
}

int main() {
    int a[N][N], c[N][N];
    int *dev_a, *dev_c;

    int size = N * N * sizeof(int);

    for(int i=0; i<N; i++)
        for (int j=0; j<N; j++){
            a[i][j] = rand() % 256;
        }

    printf("Matrix A\n");
    printMatrix(a);

    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_c, size);

    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);

    dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);

    printf("dimGrid.x = %d, dimGrid.y = %d\n", dimGrid.x, dimGrid.y);

    averageKernel<<<dimGrid,dimBlock>>>(dev_a,dev_c);
    cudaDeviceSynchronize();
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

    printf("Matrix c\n");
    printMatrix(c);

    cudaFree(dev_a);
    cudaFree(dev_c);
}

1 个答案:

答案 0 :(得分:2)

您收到“意外结果”,因为您的内核失败并且超出内存访问权限。如果您在代码中添加了error checking和/或使用了cuda-memcheck,那么您就已经知道了。

问题的根源是这两行:

c[index] = 1;

....

c[index] = avg;

是无条件执行的,当你运行的线程数超过输出矩阵的大小时,它将产生超出内存访问。如果修改内核以便只对输出矩阵范围内的线程执行,那么问题就会消失。