Question

我正在尝试使用CUDA C / C ++编程应用高斯图像模糊处理。 CPU部件运行良好，产生了良好的效果。但是，在GPU的情况下，它仅产生黑色图像。我不确定问题出在哪里。这是我的完整代码。

如何解决此问题？

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <iomanip>
#include <fstream>

#define IMW 1600
#define IMH 1600
#define CHANNEL_NUM 3
#define IMAGE_BUFFER_SIZE (IMW*IMH*CHANNEL_NUM)
#define BLOCKX 16
#define BLOCKY BLOCKX
#define BLUR_DEGREE 3
#define BLUR_SIZE 1
unsigned int width, height;

int hmask[3][3] = { 1, 2, 1,
2, 4, 2,
1, 2, 1
};

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL



__global__ void blurKernel(unsigned char * in, unsigned char * out, int w, int h) 
{ 
    int Col = blockIdx.x * blockDim.x + threadIdx.x; 
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    if (Col < w && Row < h) 
    { 
        int pixVal = 0; 
        int pixels = 0;
        // Get the average of the surrounding BLUR_SIZE x BLUR_SIZE box 
            for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow) 
                { 
                    for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol) 
                    {
                        int curRow = Row + blurRow; 
                        int curCol = Col + blurCol; 

                        // Verify we have a valid image pixel 
                        if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) 
                            { 
                                pixVal += in[curRow * w + curCol]; 
                                pixels++; // Keep track of number of pixels in the avg 
                            } 
                    } 
                } 
            // Write our new pixel value out 
        out[Row * w + Col] = (unsigned char)(pixVal / pixels); 
    } 
}

int  main(int argc, char **argv)
{
/************ Setup work ***********************/
  unsigned char *d_resultPixels;
  unsigned char *h_resultPixels;
  unsigned char *h_devicePixels;

  unsigned char *h_pixels = NULL;
  unsigned char *d_pixels = NULL;



  int nBlurDegree;
  int imageSize = sizeof(unsigned char) * IMAGE_BUFFER_SIZE;

  h_pixels = (unsigned char *)malloc(imageSize);
  h_resultPixels = (unsigned char *)malloc(imageSize);
  h_devicePixels = (unsigned char *)malloc(imageSize);

  int width1, height1, bpp;

  h_pixels = stbi_load("rana_1600_1600.png", &width1, &height1, &bpp, CHANNEL_NUM);

  width = width1;
  height = height1;

  printf("Image size: %u\n", imageSize);
  printf("Image width: %u\n", width);
  printf("Image height: %u\n", height);

  //memcpy(h_devicePixels, h_pixels, imageSize);



/************************** Start host processing ************************/
  unsigned long long cputime = dtime_usec(0);
  // cpu code here.....
  cputime = dtime_usec(cputime);

  stbi_write_png("host_output.png", width, height, CHANNEL_NUM, h_resultPixels, width*CHANNEL_NUM);

/************************** End host processing **************************/

/************************** Start device processing **********************/

  // allocate memory on the GPU for the output image
  cudaMalloc((void**)&d_pixels, imageSize);
  cudaMalloc((void**)&d_resultPixels, imageSize);

  cudaMemcpy(d_pixels, h_pixels, imageSize, cudaMemcpyHostToDevice);
  checkCUDAError("CUDA memcpy to device");

  dim3 blocksPerGrid(IMW / 16, 1);
  dim3 threadsPerBlock(16, 1);

  unsigned long long gputime = dtime_usec(0);

  for (nBlurDegree = 0; nBlurDegree < BLUR_DEGREE; nBlurDegree++)
  {
    cudaMemset(d_resultPixels, 0, imageSize);

    blurKernel << <blocksPerGrid, threadsPerBlock >> >(d_pixels, d_resultPixels, width, height);

    cudaMemcpy(d_pixels, d_resultPixels, imageSize, cudaMemcpyDeviceToDevice);

    cudaThreadSynchronize();
  }

  cudaDeviceSynchronize();
  gputime = dtime_usec(gputime);


  cudaMemcpy(h_devicePixels, d_resultPixels, imageSize, cudaMemcpyDeviceToHost);


  printf("GPU time: %f seconds, CPU time: %f seconds\n", gputime/(float)USECPSEC, cputime/(float)USECPSEC);

  printf("Speedup: %f\n", (cputime/(float)USECPSEC)/(gputime/(float)USECPSEC));

  validate(h_pixels, h_devicePixels, imageSize);

  stbi_write_png("device_output.png", width, height, CHANNEL_NUM, h_devicePixels, width*CHANNEL_NUM);


/************************** End device processing ************************/

// Release resources
  cudaFree(d_pixels);
  cudaFree(d_resultPixels);

  //stbi_image_free(h2_pixels);

  free(h_devicePixels);
  free(h_pixels);
  free(h_resultPixels);

  return 0;
} // End main

我需要有关如何获取GPU输出图像“ device_output.png”的帮助。

使用CUDA C / C ++的高斯图像模糊

0 个答案: