在CUDA中将2D Canny Edge图像转换为1D边缘像素阵列 - 奇怪的行为

时间:2013-08-07 09:52:08

标签: image-processing cuda

我有一个CUDA内核,它采用边缘图像并处理它以创建边缘像素的较小的1D阵列。现在这是奇怪的行为。每次我运行内核并计算“d_nlist”中的边缘像素数量时(参见printf附近的代码),每次都得到更大的像素数,即使我使用相同的图像并完全停止程序并重新跑。因此,每次运行它都需要更长的时间才能运行,直到最终,它会抛出一个未捕获的异常。

我的问题是,如何阻止这种情况发生,以便每次运行内核时都能获得一致的结果?

我的设备是Geforce 620。

常量:

THREADS_X = 32
THREADS_Y = 4
PIXELS_PER_THREAD = 4
MAX_QUEUE_LENGTH = THREADS_X * THREADS_Y * PIXELS_PER_THREAD
IMG_WIDTH = 256
IMG_HEIGHT = 256
IMG_SIZE = IMG_WIDTH * IMG_HEIGHT
BLOCKS_X = IMG_WIDTH /(THREADS_X * PIXELS_PER_THREAD)
BLOCKS_Y = IMG_HEIGHT / THREADS_Y

内核如下:

__global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image, 
unsigned int* const list, int* const glob_index ) {

unsigned int const x = blockIdx.x  * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
unsigned int const y = blockIdx.y  * THREADS_Y + threadIdx.y;

volatile int qindex = -1;
volatile __shared__ int sh_qindex[THREADS_Y];
volatile __shared__ int sh_qstart[THREADS_Y];
sh_qindex[threadIdx.y] = -1;

// Start by making an array
volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];

// Fill the queue
for(int i=0; i<PIXELS_PER_THREAD; i++)
{
    int const xx = i*THREADS_X + x;

    // Read one image pixel from global memory
    unsigned char const pixel = image[y*IMG_WIDTH + xx];
    unsigned int  const queue_val = (y << 16) + xx;

    if(pixel)
    {           
        do {
            qindex++;
            sh_qindex[threadIdx.y] = qindex;
            sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
        } while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
    }

    // Reload index from smem (last thread to write to smem will have updated it)
    qindex = sh_qindex[threadIdx.y];
}

// Let thread 0 reserve the space required in the global list
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
    // Find how many items are stored in each list
    int total_index = 0;
    #pragma unroll
    for(int i=0; i<THREADS_Y; i++)
    {
        sh_qstart[i] = total_index;
        total_index += (sh_qindex[i] + 1u);
    }

    // Calculate the offset in the global list
    unsigned int global_offset = atomicAdd(glob_index, total_index);
    #pragma unroll
    for(int i=0; i<THREADS_Y; i++)
    {
        sh_qstart[i] += global_offset;
    }
}
__syncthreads();

// Copy local queues to global queue
for(int i=0; i<=qindex; i+=THREADS_X)
{
    if(i + threadIdx.x > qindex)
        break;

    unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
    list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
}
}

以下是调用内核的方法:

void call2DTo1DKernel(unsigned char const * const h_image)
{
    // Device side allocation
    unsigned char *d_image = NULL;
    unsigned int *d_list = NULL;
    int h_nlist, *d_nlist = NULL;
    cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
    cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
    cudaMalloc((void**)&d_nlist, sizeof(int));

    // Time measurement initialization
    cudaEvent_t start, stop, startio, stopio;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventCreate(&startio); 
    cudaEventCreate(&stopio);

    // Start timer w/ io
    cudaEventRecord(startio,0);

    // Copy image data to device
    cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE,    cudaMemcpyHostToDevice);

    // Start timer
    cudaEventRecord(start,0);

    // Kernel call
    // Phase 1 : Convert 2D binary image to 1D pixel array
    dim3 dimBlock1(THREADS_X, THREADS_Y);
    dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
    convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);

    // Stop timer
    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);

    // Stop timer w/ io
    cudaEventRecord(stopio,0);
    cudaEventSynchronize(stopio);

    // Time measurement
    cudaEventElapsedTime(&et,start,stop);
    cudaEventElapsedTime(&etio,startio,stopio);

    // Time measurement deinitialization
    cudaEventDestroy(start); 
    cudaEventDestroy(stop);
    cudaEventDestroy(startio); 
    cudaEventDestroy(stopio);

    // Get list size
    cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);

    // Report on console
    printf("%d pixels processed...\n", h_nlist);

    // Device side dealloc
    cudaFree(d_image);
    cudaFree(d_space);
    cudaFree(d_list);
    cudaFree(d_nlist);
}

非常感谢您的帮助。

1 个答案:

答案 0 :(得分:1)

作为序言,让我建议一些有用的故障排除步骤:

  1. 使用proper cuda error checking
  2. 检测您的代码
  3. 使用cuda-memcheck运行您的代码,例如cuda-memcheck ./myapp
  4. 如果你执行上述步骤,你会发现你的内核失败了,失败与大小为4的全局写入有关。所以这会把你的注意力集中在内核的最后一段,从评论// Copy local queues to global queue

    关于您的代码,您至少有两个问题:

    1. 你的内核的最后一段中的寻址/索引,你正在将各个队列写入全局内存,这是搞砸了。我不打算为你调试这个。
    2. 您没有将d_nlist变量初始化为零。因此,当您对其进行原子添加时,您将您的值添加到垃圾值,当您重复该过程时,该值往往会增加。
    3. 以下是一些已删除问题的代码,(我没有尝试整理您的队列复制代码)并添加了错误检查。它为我产生了可重复的结果:

      $ cat t216.cu
      #include <stdio.h>
      #include <stdlib.h>
      
      #define THREADS_X 32
      #define THREADS_Y 4
      #define PIXELS_PER_THREAD 4
      #define MAX_QUEUE_LENGTH (THREADS_X*THREADS_Y*PIXELS_PER_THREAD)
      #define IMG_WIDTH 256
      #define IMG_HEIGHT 256
      #define IMG_SIZE (IMG_WIDTH*IMG_HEIGHT)
      #define BLOCKS_X (IMG_WIDTH/(THREADS_X*PIXELS_PER_THREAD))
      #define BLOCKS_Y (IMG_HEIGHT/THREADS_Y)
      
      #define cudaCheckErrors(msg) \
          do { \
              cudaError_t __err = cudaGetLastError(); \
              if (__err != cudaSuccess) { \
                  fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                      msg, cudaGetErrorString(__err), \
                      __FILE__, __LINE__); \
                  fprintf(stderr, "*** FAILED - ABORTING\n"); \
                  exit(1); \
              } \
          } while (0)
      
      __global__ void convert2DEdgeImageTo1DArray( unsigned char const * const image,
      unsigned int* const list, int* const glob_index ) {
      
      unsigned int const x = blockIdx.x  * THREADS_X*PIXELS_PER_THREAD + threadIdx.x;
      unsigned int const y = blockIdx.y  * THREADS_Y + threadIdx.y;
      
      volatile int qindex = -1;
      volatile __shared__ int sh_qindex[THREADS_Y];
      volatile __shared__ int sh_qstart[THREADS_Y];
      sh_qindex[threadIdx.y] = -1;
      
      // Start by making an array
      volatile __shared__ unsigned int sh_queue[MAX_QUEUE_LENGTH];
      
      // Fill the queue
      for(int i=0; i<PIXELS_PER_THREAD; i++)
      {
          int const xx = i*THREADS_X + x;
      
          // Read one image pixel from global memory
          unsigned char const pixel = image[y*IMG_WIDTH + xx];
          unsigned int  const queue_val = (y << 16) + xx;
      
          if(pixel)
          {
              do {
                  qindex++;
                  sh_qindex[threadIdx.y] = qindex;
                  sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] = queue_val;
              } while (sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + qindex] != queue_val);
          }
      
          // Reload index from smem (last thread to write to smem will have updated it)
          qindex = sh_qindex[threadIdx.y];
      }
      
      // Let thread 0 reserve the space required in the global list
      __syncthreads();
      if(threadIdx.x == 0 && threadIdx.y == 0)
      {
          // Find how many items are stored in each list
          int total_index = 0;
          #pragma unroll
          for(int i=0; i<THREADS_Y; i++)
          {
              sh_qstart[i] = total_index;
              total_index += (sh_qindex[i] + 1u);
          }
      
          // Calculate the offset in the global list
          unsigned int global_offset = atomicAdd(glob_index, total_index);
          #pragma unroll
          for(int i=0; i<THREADS_Y; i++)
          {
              sh_qstart[i] += global_offset;
          }
      
      }
      __syncthreads();
      
      // Copy local queues to global queue
      /*
      for(int i=0; i<=qindex; i+=THREADS_X)
      {
          if(i + threadIdx.x > qindex)
              break;
      
          unsigned int qvalue = sh_queue[threadIdx.y*THREADS_X*PIXELS_PER_THREAD + i + threadIdx.x];
          list[sh_qstart[threadIdx.y] + i + threadIdx.x] = qvalue;
      }
      */
      }
      
      void call2DTo1DKernel(unsigned char const * const h_image)
      {
          // Device side allocation
          unsigned char *d_image = NULL;
          unsigned int *d_list = NULL;
          int h_nlist=0, *d_nlist = NULL;
          cudaMalloc((void**)&d_image, sizeof(unsigned char)*IMG_SIZE);
          cudaMalloc((void**)&d_list, sizeof(unsigned int)*IMG_SIZE);
          cudaMalloc((void**)&d_nlist, sizeof(int));
          cudaCheckErrors("cudamalloc fail");
      
          // Time measurement initialization
          cudaEvent_t start, stop, startio, stopio;
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventCreate(&startio);
          cudaEventCreate(&stopio);
          float et, etio;
      
          // Start timer w/ io
          cudaEventRecord(startio,0);
          cudaMemcpy(d_nlist, &h_nlist, sizeof(int), cudaMemcpyHostToDevice);
          // Copy image data to device
          cudaMemcpy((void*)d_image, (void*)h_image, sizeof(unsigned char)*IMG_SIZE,    cudaMemcpyHostToDevice);
          cudaCheckErrors("cudamemcpy 1");
          // Start timer
          cudaEventRecord(start,0);
      
          // Kernel call
          // Phase 1 : Convert 2D binary image to 1D pixel array
          dim3 dimBlock1(THREADS_X, THREADS_Y);
          dim3 dimGrid1(BLOCKS_X, BLOCKS_Y);
          convert2DEdgeImageTo1DArray<<<dimGrid1, dimBlock1>>>(d_image, d_list, d_nlist);
          cudaDeviceSynchronize();
          cudaCheckErrors("kernel fail");
          // Stop timer
          cudaEventRecord(stop,0);
          cudaEventSynchronize(stop);
      
          // Stop timer w/ io
          cudaEventRecord(stopio,0);
          cudaEventSynchronize(stopio);
      
          // Time measurement
          cudaEventElapsedTime(&et,start,stop);
          cudaEventElapsedTime(&etio,startio,stopio);
      
          // Time measurement deinitialization
          cudaEventDestroy(start);
          cudaEventDestroy(stop);
          cudaEventDestroy(startio);
          cudaEventDestroy(stopio);
      
          // Get list size
          cudaMemcpy((void*)&h_nlist, (void*)d_nlist, sizeof(int), cudaMemcpyDeviceToHost);
          cudaCheckErrors("cudaMemcpy 2");
          // Report on console
          printf("%d pixels processed...\n", h_nlist);
      
          // Device side dealloc
          cudaFree(d_image);
      //    cudaFree(d_space);
          cudaFree(d_list);
          cudaFree(d_nlist);
      }
      
      int main(){
      
        unsigned char *image;
      
        image = (unsigned char *)malloc(IMG_SIZE * sizeof(unsigned char));
        if (image == 0) {printf("malloc fail\n"); return 0;}
      
        for (int i =0 ; i<IMG_SIZE; i++)
          image[i] = rand()%2;
      
        call2DTo1DKernel(image);
        call2DTo1DKernel(image);
        call2DTo1DKernel(image);
        call2DTo1DKernel(image);
        call2DTo1DKernel(image);
        cudaCheckErrors("some error");
        return 0;
      }
      
      $ nvcc -arch=sm_20 -O3 -o t216 t216.cu
      $ ./t216
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      $ ./t216
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      32617 pixels processed...
      $