Question

我自己学习了CUDA，并尝试使用简单的中值滤波器进行图像处理。这就是我想出来的，但我似乎无法从它出来的图像中获得任何好的结果。例如，输出图像相对没有噪声，但图像的饱和度似乎更高，当我尝试维基百科的teddy bear图片时，他的鼻子由于某种原因变绿了。考虑到任何新想法，我感到非常沮丧，所以如果有人能在代码中看到问题，我将非常感激。谢谢！

这是内核函数：

__global__ void median_filter(int *input, int *output, int IMAGE_W, int IMAGE_H){

    __shared__ float window[BLOCK_W*BLOCK_H][9];

    int x, y, tid;
    int i, j, iMin, temp;

    x = blockIdx.x*blockDim.x + threadIdx.x;
    y = blockIdx.y*blockDim.y + threadIdx.y;
    tid = threadIdx.y*blockDim.y + threadIdx.x;

    if(x>=IMAGE_W && y>=IMAGE_H)
        return;

    /* setting 3x3 window elements for median */
    if(y==0 && x==0)
        window[tid][0] = input[y*IMAGE_W+x];
    else if(y==0 && x!=0)
        window[tid][0] = input[y*IMAGE_W+x-1];
    else if(y!=0 && x==0)
        window[tid][0] = input[(y-1)*IMAGE_W+x];
    else
        window[tid][0] = input[(y-1)*IMAGE_W+x-1];

    window[tid][1] = (y==0)?input[y*IMAGE_W+x]:input[(y-1)*IMAGE_W+x];

    if(y==0 && x==IMAGE_W-1)
        window[tid][2] = input[y*IMAGE_W+x];
    else if(y!=0 && x==IMAGE_W-1)
        window[tid][2] = input[(y-1)*IMAGE_W+x];
    else if(y==0 && x!=IMAGE_W-1)
        window[tid][2] = input[(y-1)*IMAGE_W+x+1];
    else
        window[tid][2] = input[(y-1)*IMAGE_W+x+1];

    window[tid][3] = (x==0)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x-1];
    window[tid][4] = input[y*IMAGE_W+x];
    window[tid][5] = (x==IMAGE_W-1)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x+1];

    if(y==IMAGE_H-1 && x==0)
        window[tid][6] = input[y*IMAGE_W+x];
    else if(y!=IMAGE_H-1 && x==0)
        window[tid][6] = input[(y+1)*IMAGE_W+x];
    else if(y==IMAGE_H-1 && x!=0)
        window[tid][6] = input[y*IMAGE_W+x-1];
    else
        window[tid][6] = input[(y+1)*IMAGE_W+x-1];

    window[tid][7] = (y==IMAGE_H-1)?input[y*IMAGE_W+x]:input[(y+1)*IMAGE_W+x];

    if(y==IMAGE_H-1 && x==IMAGE_W-1)
        window[tid][8] = input[y*IMAGE_W+x];
    else if(y!=IMAGE_H-1 && x==IMAGE_W-1)
        window[tid][8] = input[(y+1)*IMAGE_W+x];
    else if(y==IMAGE_H-1 && x!=IMAGE_W-1)
        window[tid][8] = input[y*IMAGE_W+x+1];
    else
        window[tid][8] = input[(y+1)*IMAGE_W+x+1];

    __syncthreads();

    /* sorting window to find median */
    for(j=0; j<8; j++){
        iMin = j;
        for(i=j+1; i<9; i++){
            if(window[tid][i] < window[tid][iMin]){
                iMin = i;
            }
        }
        if(iMin != j){
            temp = window[tid][iMin];
            window[tid][iMin] = window[tid][j];
            window[tid][j] = temp;
        }
        __syncthreads();
    }

    output[y*IMAGE_W + x] = window[tid][4];
}

主要功能：

int main(){
    /*loading picture*/
    char picture[50] = "before.bmp";

    FILE *image = fopen(picture, "rb");

    if(image == NULL)
    {
        printf("Load picture error!\n");
        system("pause");
        exit(1);
    }

    BITMAPFILEHEADER bmpFHeader;
    BITMAPINFOHEADER bmpIHeader;
    fread(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
    fread(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);

    int imgWidth = bmpIHeader.biWidth;
    int imgHeight = bmpIHeader.biHeight;

    int img_size = imgWidth * imgHeight * sizeof(int);

    int * imgeRedChannel_x = (int *)malloc(img_size);
    int * imgeGreenChannel_x = (int *)malloc(img_size);
    int * imgeBlueChannel_x = (int *)malloc(img_size);

    int * deviceInputRed;
    int * deviceInputGreen;
    int * deviceInputBlue;

    int * deviceOutputRd;
    int * deviceOutputGreen;
    int * deviceOutputBlue;

    for(int i = imgHeight-1; i>=0; i--)
    {
        for(int j = 0; j<imgWidth; j++)
        {

                fread(&(imgeGreenChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
                fread(&(imgeBlueChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
                fread(&(imgeRedChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);

        }
    }

    cudaMalloc((void **) &deviceInputRed, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceInputBlue, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceInputGreen, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputRd, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputBlue, sizeof(int) * imgHeight * imgWidth);
    cudaMalloc((void **) &deviceOutputGreen, sizeof(int) * imgHeight * imgWidth);

    int dimA = imgWidth*imgHeight;
    int numThreadsPerBlock = 256;
    int numBlocks = dimA / numThreadsPerBlock;
    int sharedMemSize = numThreadsPerBlock*sizeof(int);

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    cudaMemcpy(deviceInputRed,imgeRedChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d r");
    cudaMemcpy(deviceInputGreen,imgeGreenChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d g");
    cudaMemcpy(deviceInputBlue,imgeBlueChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
    checkCUDAError("memcpy h-d b");

    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputRed, deviceOutputRd, imgHeight, imgWidth);
    checkCUDAError("kernel invocation r");
    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputGreen, deviceOutputGreen, imgHeight, imgWidth);
    checkCUDAError("kernel invocation g");
    median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputBlue, deviceOutputBlue, imgHeight, imgWidth);
    checkCUDAError("kernel invocation b");

    cudaMemcpy(imgeRedChannel_x, deviceOutputRd, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h r");
    cudaMemcpy(imgeGreenChannel_x, deviceOutputGreen, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h g");
    cudaMemcpy(imgeBlueChannel_x, deviceOutputBlue, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy d-h b");

    cudaFree(deviceInputRed);
    cudaFree(deviceOutputRd);
    cudaFree(deviceInputGreen);
    cudaFree(deviceOutputGreen);
    cudaFree(deviceInputBlue);
    cudaFree(deviceOutputBlue);

    /*saving new picture*/
    fclose(image);

    char title[50]="after";
    strcat(title, ".bmp");

    remove(title);
    image = fopen(title,"wb");

    fwrite(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
    fwrite(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);

    for(int i = imgHeight-1; i>=0; i--)
    {

        for(int j = 0; j<imgWidth; j++)
        {
            int b = imgeBlueChannel_x[i * (imgWidth) + j];
            int g = imgeGreenChannel_x[i * (imgWidth) + j];
            int r = imgeRedChannel_x[i * (imgWidth) + j]; 

            if(b>255)b=255;
            if(g>255)g=255;
            if(r>255)r=255;



            fwrite(&g, sizeof(unsigned char), 1, image);
            fwrite(&b, sizeof(unsigned char), 1, image);
            fwrite(&r, sizeof(unsigned char), 1, image);
        }
    }

    printf("Success!\n");
    fclose(image);
    system("pause");
    return 0;
}

Answer 1

鼻子变绿意味着你的代码中有溢出，但这很奇怪，因为中值滤波器永远不会产生溢出。你肯定有一个混乱的代码，内核没有多大意义，尤其是你正在做的大量额外工作。

在非线性过滤器中，我建议您首先尝试实施Min或Max过滤器以查看它们是否有效。这是来自CUVI CUDA库的最大过滤器的工作代码。您的Median内核应该与此无异：

__global__ void median_8u_c3( unsigned char* out,
                              unsigned int width,
                              unsigned int widthStep,
                              unsigned int height){

int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
int  yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
int tid = yIndex * widthStep + (3*xIndex);

if(xIndex>=width|| yIndex>=height) return;

int limitX = anchorX + fHeight - 1;
int limitY = anchorY + fWidth - 1;

unsigned char MAX_R = 0 , MAX_G = 0, MAX_B = 0;  


 // Instead of Max filter code in the for loops below, you can have median code
for(Cuvi32s i=anchorX ; i<= limitX; i++)
    for(Cuvi32s j=anchorY ; j<= limitY; j++)
    {
        MAX_R = (tex2D(tex8,3*(xIndex + i)  , yIndex + j) > MAX_R) ? tex2D(tex8,3*(xIndex + i)  , yIndex + j) : MAX_R;
        MAX_G = (tex2D(tex8,3*(xIndex + i)+1, yIndex + j) > MAX_G) ? tex2D(tex8,3*(xIndex + i)+1, yIndex + j) : MAX_G;
        MAX_B = (tex2D(tex8,3*(xIndex + i)+2, yIndex + j) > MAX_B) ? tex2D(tex8,3*(xIndex + i)+2, yIndex + j) : MAX_B;
    }


out[tid] = MAX_R;
out[tid + 1] = MAX_G;
out[tid + 2] = MAX_B;
}

注意：我正在使用Textures的输入。

CUDA中位数过滤器无法正常工作

1 个答案: