我正在尝试使用CUDA C / C ++编程应用高斯图像模糊处理。 CPU部件运行良好,产生了良好的效果。但是,在GPU的情况下,它仅产生黑色图像。我不确定问题出在哪里。这是我的完整代码。
如何解决此问题?
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <fstream>
#define IMW 1600
#define IMH 1600
#define CHANNEL_NUM 3
#define IMAGE_BUFFER_SIZE (IMW*IMH*CHANNEL_NUM)
#define BLOCKX 16
#define BLOCKY BLOCKX
#define BLUR_DEGREE 3
#define BLUR_SIZE 1
unsigned int width, height;
int hmask[3][3] = { 1, 2, 1,
2, 4, 2,
1, 2, 1
};
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
__global__ void blurKernel(unsigned char * in, unsigned char * out, int w, int h)
{
int Col = blockIdx.x * blockDim.x + threadIdx.x;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
if (Col < w && Row < h)
{
int pixVal = 0;
int pixels = 0;
// Get the average of the surrounding BLUR_SIZE x BLUR_SIZE box
for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow)
{
for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol)
{
int curRow = Row + blurRow;
int curCol = Col + blurCol;
// Verify we have a valid image pixel
if(curRow > -1 && curRow < h && curCol > -1 && curCol < w)
{
pixVal += in[curRow * w + curCol];
pixels++; // Keep track of number of pixels in the avg
}
}
}
// Write our new pixel value out
out[Row * w + Col] = (unsigned char)(pixVal / pixels);
}
}
int main(int argc, char **argv)
{
/************ Setup work ***********************/
unsigned char *d_resultPixels;
unsigned char *h_resultPixels;
unsigned char *h_devicePixels;
unsigned char *h_pixels = NULL;
unsigned char *d_pixels = NULL;
int nBlurDegree;
int imageSize = sizeof(unsigned char) * IMAGE_BUFFER_SIZE;
h_pixels = (unsigned char *)malloc(imageSize);
h_resultPixels = (unsigned char *)malloc(imageSize);
h_devicePixels = (unsigned char *)malloc(imageSize);
int width1, height1, bpp;
h_pixels = stbi_load("rana_1600_1600.png", &width1, &height1, &bpp, CHANNEL_NUM);
width = width1;
height = height1;
printf("Image size: %u\n", imageSize);
printf("Image width: %u\n", width);
printf("Image height: %u\n", height);
//memcpy(h_devicePixels, h_pixels, imageSize);
/************************** Start host processing ************************/
unsigned long long cputime = dtime_usec(0);
// cpu code here.....
cputime = dtime_usec(cputime);
stbi_write_png("host_output.png", width, height, CHANNEL_NUM, h_resultPixels, width*CHANNEL_NUM);
/************************** End host processing **************************/
/************************** Start device processing **********************/
// allocate memory on the GPU for the output image
cudaMalloc((void**)&d_pixels, imageSize);
cudaMalloc((void**)&d_resultPixels, imageSize);
cudaMemcpy(d_pixels, h_pixels, imageSize, cudaMemcpyHostToDevice);
checkCUDAError("CUDA memcpy to device");
dim3 blocksPerGrid(IMW / 16, 1);
dim3 threadsPerBlock(16, 1);
unsigned long long gputime = dtime_usec(0);
for (nBlurDegree = 0; nBlurDegree < BLUR_DEGREE; nBlurDegree++)
{
cudaMemset(d_resultPixels, 0, imageSize);
blurKernel << <blocksPerGrid, threadsPerBlock >> >(d_pixels, d_resultPixels, width, height);
cudaMemcpy(d_pixels, d_resultPixels, imageSize, cudaMemcpyDeviceToDevice);
cudaThreadSynchronize();
}
cudaDeviceSynchronize();
gputime = dtime_usec(gputime);
cudaMemcpy(h_devicePixels, d_resultPixels, imageSize, cudaMemcpyDeviceToHost);
printf("GPU time: %f seconds, CPU time: %f seconds\n", gputime/(float)USECPSEC, cputime/(float)USECPSEC);
printf("Speedup: %f\n", (cputime/(float)USECPSEC)/(gputime/(float)USECPSEC));
validate(h_pixels, h_devicePixels, imageSize);
stbi_write_png("device_output.png", width, height, CHANNEL_NUM, h_devicePixels, width*CHANNEL_NUM);
/************************** End device processing ************************/
// Release resources
cudaFree(d_pixels);
cudaFree(d_resultPixels);
//stbi_image_free(h2_pixels);
free(h_devicePixels);
free(h_pixels);
free(h_resultPixels);
return 0;
} // End main
我需要有关如何获取GPU输出图像“ device_output.png”的帮助。