我自己学习了CUDA,并尝试使用简单的中值滤波器进行图像处理。这就是我想出来的,但我似乎无法从它出来的图像中获得任何好的结果。例如,输出图像相对没有噪声,但图像的饱和度似乎更高,当我尝试维基百科的teddy bear图片时,他的鼻子由于某种原因变绿了。考虑到任何新想法,我感到非常沮丧,所以如果有人能在代码中看到问题,我将非常感激。谢谢!
这是内核函数:
__global__ void median_filter(int *input, int *output, int IMAGE_W, int IMAGE_H){
__shared__ float window[BLOCK_W*BLOCK_H][9];
int x, y, tid;
int i, j, iMin, temp;
x = blockIdx.x*blockDim.x + threadIdx.x;
y = blockIdx.y*blockDim.y + threadIdx.y;
tid = threadIdx.y*blockDim.y + threadIdx.x;
if(x>=IMAGE_W && y>=IMAGE_H)
return;
/* setting 3x3 window elements for median */
if(y==0 && x==0)
window[tid][0] = input[y*IMAGE_W+x];
else if(y==0 && x!=0)
window[tid][0] = input[y*IMAGE_W+x-1];
else if(y!=0 && x==0)
window[tid][0] = input[(y-1)*IMAGE_W+x];
else
window[tid][0] = input[(y-1)*IMAGE_W+x-1];
window[tid][1] = (y==0)?input[y*IMAGE_W+x]:input[(y-1)*IMAGE_W+x];
if(y==0 && x==IMAGE_W-1)
window[tid][2] = input[y*IMAGE_W+x];
else if(y!=0 && x==IMAGE_W-1)
window[tid][2] = input[(y-1)*IMAGE_W+x];
else if(y==0 && x!=IMAGE_W-1)
window[tid][2] = input[(y-1)*IMAGE_W+x+1];
else
window[tid][2] = input[(y-1)*IMAGE_W+x+1];
window[tid][3] = (x==0)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x-1];
window[tid][4] = input[y*IMAGE_W+x];
window[tid][5] = (x==IMAGE_W-1)?input[y*IMAGE_W+x]:input[y*IMAGE_W+x+1];
if(y==IMAGE_H-1 && x==0)
window[tid][6] = input[y*IMAGE_W+x];
else if(y!=IMAGE_H-1 && x==0)
window[tid][6] = input[(y+1)*IMAGE_W+x];
else if(y==IMAGE_H-1 && x!=0)
window[tid][6] = input[y*IMAGE_W+x-1];
else
window[tid][6] = input[(y+1)*IMAGE_W+x-1];
window[tid][7] = (y==IMAGE_H-1)?input[y*IMAGE_W+x]:input[(y+1)*IMAGE_W+x];
if(y==IMAGE_H-1 && x==IMAGE_W-1)
window[tid][8] = input[y*IMAGE_W+x];
else if(y!=IMAGE_H-1 && x==IMAGE_W-1)
window[tid][8] = input[(y+1)*IMAGE_W+x];
else if(y==IMAGE_H-1 && x!=IMAGE_W-1)
window[tid][8] = input[y*IMAGE_W+x+1];
else
window[tid][8] = input[(y+1)*IMAGE_W+x+1];
__syncthreads();
/* sorting window to find median */
for(j=0; j<8; j++){
iMin = j;
for(i=j+1; i<9; i++){
if(window[tid][i] < window[tid][iMin]){
iMin = i;
}
}
if(iMin != j){
temp = window[tid][iMin];
window[tid][iMin] = window[tid][j];
window[tid][j] = temp;
}
__syncthreads();
}
output[y*IMAGE_W + x] = window[tid][4];
}
主要功能:
int main(){
/*loading picture*/
char picture[50] = "before.bmp";
FILE *image = fopen(picture, "rb");
if(image == NULL)
{
printf("Load picture error!\n");
system("pause");
exit(1);
}
BITMAPFILEHEADER bmpFHeader;
BITMAPINFOHEADER bmpIHeader;
fread(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
fread(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);
int imgWidth = bmpIHeader.biWidth;
int imgHeight = bmpIHeader.biHeight;
int img_size = imgWidth * imgHeight * sizeof(int);
int * imgeRedChannel_x = (int *)malloc(img_size);
int * imgeGreenChannel_x = (int *)malloc(img_size);
int * imgeBlueChannel_x = (int *)malloc(img_size);
int * deviceInputRed;
int * deviceInputGreen;
int * deviceInputBlue;
int * deviceOutputRd;
int * deviceOutputGreen;
int * deviceOutputBlue;
for(int i = imgHeight-1; i>=0; i--)
{
for(int j = 0; j<imgWidth; j++)
{
fread(&(imgeGreenChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
fread(&(imgeBlueChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
fread(&(imgeRedChannel_x[i * (imgWidth) + j]), sizeof(unsigned char), 1, image);
}
}
cudaMalloc((void **) &deviceInputRed, sizeof(int) * imgHeight * imgWidth);
cudaMalloc((void **) &deviceInputBlue, sizeof(int) * imgHeight * imgWidth);
cudaMalloc((void **) &deviceInputGreen, sizeof(int) * imgHeight * imgWidth);
cudaMalloc((void **) &deviceOutputRd, sizeof(int) * imgHeight * imgWidth);
cudaMalloc((void **) &deviceOutputBlue, sizeof(int) * imgHeight * imgWidth);
cudaMalloc((void **) &deviceOutputGreen, sizeof(int) * imgHeight * imgWidth);
int dimA = imgWidth*imgHeight;
int numThreadsPerBlock = 256;
int numBlocks = dimA / numThreadsPerBlock;
int sharedMemSize = numThreadsPerBlock*sizeof(int);
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
cudaMemcpy(deviceInputRed,imgeRedChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
checkCUDAError("memcpy h-d r");
cudaMemcpy(deviceInputGreen,imgeGreenChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
checkCUDAError("memcpy h-d g");
cudaMemcpy(deviceInputBlue,imgeBlueChannel_x,sizeof(int) * imgHeight * imgWidth,cudaMemcpyHostToDevice);
checkCUDAError("memcpy h-d b");
median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputRed, deviceOutputRd, imgHeight, imgWidth);
checkCUDAError("kernel invocation r");
median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputGreen, deviceOutputGreen, imgHeight, imgWidth);
checkCUDAError("kernel invocation g");
median_filter<<< dimGrid , dimBlock, sharedMemSize>>> (deviceInputBlue, deviceOutputBlue, imgHeight, imgWidth);
checkCUDAError("kernel invocation b");
cudaMemcpy(imgeRedChannel_x, deviceOutputRd, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("memcpy d-h r");
cudaMemcpy(imgeGreenChannel_x, deviceOutputGreen, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("memcpy d-h g");
cudaMemcpy(imgeBlueChannel_x, deviceOutputBlue, imgHeight * imgWidth * sizeof(int), cudaMemcpyDeviceToHost);
checkCUDAError("memcpy d-h b");
cudaFree(deviceInputRed);
cudaFree(deviceOutputRd);
cudaFree(deviceInputGreen);
cudaFree(deviceOutputGreen);
cudaFree(deviceInputBlue);
cudaFree(deviceOutputBlue);
/*saving new picture*/
fclose(image);
char title[50]="after";
strcat(title, ".bmp");
remove(title);
image = fopen(title,"wb");
fwrite(&bmpFHeader, sizeof(BITMAPFILEHEADER), 1, image);
fwrite(&bmpIHeader, sizeof(BITMAPINFOHEADER), 1, image);
for(int i = imgHeight-1; i>=0; i--)
{
for(int j = 0; j<imgWidth; j++)
{
int b = imgeBlueChannel_x[i * (imgWidth) + j];
int g = imgeGreenChannel_x[i * (imgWidth) + j];
int r = imgeRedChannel_x[i * (imgWidth) + j];
if(b>255)b=255;
if(g>255)g=255;
if(r>255)r=255;
fwrite(&g, sizeof(unsigned char), 1, image);
fwrite(&b, sizeof(unsigned char), 1, image);
fwrite(&r, sizeof(unsigned char), 1, image);
}
}
printf("Success!\n");
fclose(image);
system("pause");
return 0;
}
答案 0 :(得分:1)
鼻子变绿意味着你的代码中有溢出,但这很奇怪,因为中值滤波器永远不会产生溢出。你肯定有一个混乱的代码,内核没有多大意义,尤其是你正在做的大量额外工作。
在非线性过滤器中,我建议您首先尝试实施Min或Max过滤器以查看它们是否有效。这是来自CUVI CUDA库的最大过滤器的工作代码。您的Median内核应该与此无异:
__global__ void median_8u_c3( unsigned char* out,
unsigned int width,
unsigned int widthStep,
unsigned int height){
int xIndex = blockIdx.x*BLOCK_SIZE + threadIdx.x;
int yIndex = blockIdx.y*BLOCK_SIZE + threadIdx.y;
int tid = yIndex * widthStep + (3*xIndex);
if(xIndex>=width|| yIndex>=height) return;
int limitX = anchorX + fHeight - 1;
int limitY = anchorY + fWidth - 1;
unsigned char MAX_R = 0 , MAX_G = 0, MAX_B = 0;
// Instead of Max filter code in the for loops below, you can have median code
for(Cuvi32s i=anchorX ; i<= limitX; i++)
for(Cuvi32s j=anchorY ; j<= limitY; j++)
{
MAX_R = (tex2D(tex8,3*(xIndex + i) , yIndex + j) > MAX_R) ? tex2D(tex8,3*(xIndex + i) , yIndex + j) : MAX_R;
MAX_G = (tex2D(tex8,3*(xIndex + i)+1, yIndex + j) > MAX_G) ? tex2D(tex8,3*(xIndex + i)+1, yIndex + j) : MAX_G;
MAX_B = (tex2D(tex8,3*(xIndex + i)+2, yIndex + j) > MAX_B) ? tex2D(tex8,3*(xIndex + i)+2, yIndex + j) : MAX_B;
}
out[tid] = MAX_R;
out[tid + 1] = MAX_G;
out[tid + 2] = MAX_B;
}
注意:我正在使用Textures的输入。