I am trying to complete the homework for Udacity's "Intro to Parallel Programming" course, and I am stuck on the second assignment, which is basically applying a Gaussian blur mask to an image using CUDA. I want to do this efficiently by exploiting shared memory. My idea for handling the boundary pixels of a block is to launch more threads than there are active pixels in the block: for example, if I divide the input image into blocks of 16x16 active pixels and I have a 9x9 mask, then my actual block size will be (for both x and y): 16 + 2*(9/2) = 24. This way I launch 24x24 threads per block, where the "outer" threads are used only to load pixels from the input image into shared memory, while the "inner" threads correspond to the active pixels that actually perform the computation (in addition to caching into shared memory).
For some reason it doesn't work. As you can see from the attached code, I am able to cache the pixels into shared memory, but something goes very wrong during the computation, and I have attached an image of the bad result I am getting.
__global__ void gaussian_blur(const unsigned char* const inputChannel,
                              unsigned char* const outputChannel,
                              int numRows, int numCols,
                              const float* const filter, const int filterWidth)
{
    int filter_radius = (int)(filterWidth / 2); // getting the filter "radius"
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x >= (numCols + filter_radius) || y >= (numRows + filter_radius))
        return;
    int px = x - filter_radius;
    int py = y - filter_radius;
    // clamping
    if (px < 0) px = 0;
    if (py < 0) py = 0;
    //if (px >= numCols) px = numCols - 1;
    //if (py >= numRows) py = numRows - 1;

    __shared__ unsigned char tile[(16 + 8) * (16 + 8)]; // 16 active pixels + 2*filter_radius
    tile[threadIdx.y * 24 + threadIdx.x] = inputChannel[py * numCols + px];
    __syncthreads();

    // Here everything is working fine: if I do
    //     outputChannel[py*numCols+px] = tile[threadIdx.y*24+threadIdx.x];
    // then I am able to see the perfect reconstruction of the input image.

    // caching the filter
    __shared__ float t_filter[81]; // 9x9 conv mask
    if (threadIdx.x == 0 && threadIdx.y == 0)
    {
        for (int i = 0; i < 81; i++)
            t_filter[i] = filter[i];
    }
    __syncthreads();

    // I am checking the threadIdx of the threads and performing the mask computation
    // only on those threads that point to active pixels:
    // i.e. all the threads whose id is greater than or equal to the filter radius,
    // but smaller than the whole block of active pixels, perform the computation.
    // filter_radius = filterWidth/2 = 9/2 = 4
    // blockDim.x or y = 16 + filter_radius*2 = 16+8 = 24
    // active pixel index limit = filter_radius+16 = 4+16 = 20
    // is that correct?
    if (threadIdx.y >= filter_radius && threadIdx.x >= filter_radius &&
        threadIdx.x < 20 && threadIdx.y < 20)
    {
        float value = 0.0f;
        for (int i = -filter_radius; i <= filter_radius; i++)
            for (int j = -filter_radius; j <= filter_radius; j++)
            {
                int fx = i + filter_radius;
                int fy = j + filter_radius;
                int ty = threadIdx.y + i;
                int tx = threadIdx.x + j;
                value += ((float)tile[ty * 24 + tx]) * t_filter[fy * filterWidth + fx];
            }
        outputChannel[py * numCols + px] = (unsigned char)value;
    }
}
Output image: http://i.stack.imgur.com/EMu5M.png
EDIT: adding the kernel call:
int filter_radius = (int)(filterWidth / 2);
blockSize.x = 16 + 2 * filter_radius;
blockSize.y = 16 + 2 * filter_radius;
gridSize.x = numCols / 16 + 1;
gridSize.y = numRows / 16 + 1;
printf("\n grx %d gry %d \n", blockSize.x, blockSize.y);

gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, numRows, numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, numRows, numCols, d_filter, filterWidth);
gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, numRows, numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

blockSize.x = 32; gridSize.x = numCols / 32 + 1;
blockSize.y = 32; gridSize.y = numRows / 32 + 1;

// Now we recombine your results. We take care of launching this kernel for you.
//
// NOTE: This kernel launch depends on the gridSize and blockSize variables,
// which you must set yourself.
recombineChannels<<<gridSize, blockSize>>>(d_redBlurred,
                                           d_greenBlurred,
                                           d_blueBlurred,
                                           d_outputImageRGBA,
                                           numRows,
                                           numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
EDIT bis:
All the other code necessary to compile and run can be found here: https://github.com/udacity/cs344/tree/master/Problem%20Sets/Problem%20Set%202 . The kernel above should be coded in the student_func.cu file.
Answer 0 (score: 0)
In your implementation, each block never computes the blur for its boundary pixels (those within one filter radius of the block's edge). This means you want the blocks to overlap so that those borders are covered. If you look at the domain of each block's x index
int x = blockDim.x*blockIdx.x+threadIdx.x;
then, given your specific kernel launch above, we have
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [24,47]
... etc
As you can see, each block considers a unique portion of the image, but you are telling each block not to compute on its borders. This means the borders of each block are omitted from the computation entirely (hence the black grid in your output image).
You need to compute your indices with something like

int x = (blockDim.x - 2*filter_radius)*blockIdx.x + threadIdx.x;

so that the blocks overlap. Now the domain of our x index looks like
blockIdx.x = 0: x = [0,23]
blockIdx.x = 1: x = [16,39]
... etc