Question

我正在编写一个程序，用CUDA对16位灰度图像进行模糊处理。

在我的程序中，如果使用sigma = 20或30的高斯模糊函数，会花费很长时间；而使用sigma = 2.0或3.0时则很快。

我在一些网站上看到，基于FFT的高斯模糊（Gaussian blur）适用于较大的内核尺寸或较大的sigma值：

这是真的吗？
我应该使用哪种算法：简单的高斯模糊或FFT的高斯模糊？

我的高斯模糊（Gaussian Blur）代码如下。我的代码中是否有错误？

在这里输入代码

// Blurs one channel of a 16-bit image with a dense 2D convolution.
//
// Launch layout: 2D grid of 2D blocks; the thread with global index (x, y)
// produces the output pixel at column x, row y. Threads that fall outside
// the image return immediately, so the grid may overshoot the image size.
// No shared memory is used.
//
// blurredChannel : output, rows * cols pixels, row-major (must be readable/writable from the device)
// inputChannel   : input,  rows * cols pixels, row-major (must be readable from the device)
// filterWeight   : filterWidth * filterWidth weights, row-major; the weights are
//                  expected to sum to 1 so the result is a weighted average
// filterWidth    : number of filter taps per axis. Odd widths are centered on the
//                  pixel; for an even width the window covers offsets
//                  -filterWidth/2 .. filterWidth/2 - 1.
//
// Pixels outside the image are replaced by the nearest border pixel (clamping).
__global__
void gaussian_blur(
    unsigned short* const       blurredChannel,                     // return value: blurred channel (either red, green, or blue)
    const unsigned short* const inputChannel,                       // red, green, or blue channel from the original image
    int                         rows,
    int                         cols,
    const float* const          filterWeight,                       // gaussian filter weights. The weights look like a bell shape.
    int                         filterWidth                         // number of pixels in x and y directions for calculating average blurring
    )
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;            // current row
    const int c = blockIdx.x * blockDim.x + threadIdx.x;            // current column

    if ((r >= rows) || (c >= cols))
    {
        return;
    }

    const int half    = filterWidth / 2;
    const int lastRow = rows - 1;
    const int lastCol = cols - 1;
    float     blur    = 0.f;                                        // weighted sum of the neighbourhood

    // Iterate over filter taps (not over -half..half) so that exactly
    // filterWidth * filterWidth weights are read even when filterWidth is even.
    for (int fi = 0; fi < filterWidth; ++fi)                        // filter rows
    {
        // Clamp the sampled row to the image border.
        const int h = min(max(r + fi - half, 0), lastRow);

        for (int fj = 0; fj < filterWidth; ++fj)                    // filter columns
        {
            // Clamp the sampled column to the image border.
            const int   w      = min(max(c + fj - half, 0), lastCol);
            const float pixel  = static_cast<float>(inputChannel[w + cols * h]);
            const float weight = filterWeight[fi * filterWidth + fj];

            blur += pixel * weight;
        }
    }

    // Round to nearest instead of truncating, and saturate to the 16-bit range:
    // converting an out-of-range float to unsigned short is undefined behaviour.
    const float rounded = fminf(fmaxf(blur + 0.5f, 0.0f), 65535.0f);
    blurredChannel[c + r * cols] = static_cast<unsigned short>(rounded);
}





// Fills gKernel with a normalized 2D Gaussian filter.
//
// gKernel : output array with room for (2*radius+1) * (2*radius+1) floats,
//           written row by row (outer loop over x, inner loop over y)
// sigma   : standard deviation of the Gaussian; must be non-zero, because the
//           exponent is divided by 2*sigma*sigma
// radius  : number of taps on each side of the centre; a negative radius
//           writes nothing
//
// After the call the weights sum to 1 (up to float rounding).
void createFilter(float *gKernel,double sigma,int radius)
{
    const double s = 2.0 * sigma * sigma;

    // sum is for normalization
    double sum = 0.0;

    // Generate the unnormalized (2*radius+1) x (2*radius+1) kernel.
    // The usual 1/(pi*s) prefactor is omitted: it is the same for every tap
    // and cancels out in the normalization below.
    int m = 0;
    for (int x = -radius; x <= radius; x++)
    {
        for (int y = -radius; y <= radius; y++)
        {
            // Use the squared distance directly; taking a square root only to
            // square it again adds rounding error and cost.
            const double distSq = static_cast<double>(x * x + y * y);
            gKernel[m] = static_cast<float>(std::exp(-distSq / s));
            sum += gKernel[m];
            m++;
        }
    }

    // Normalize the kernel; m is the number of weights written above.
    for (int k = 0; k < m; ++k)
    {
        gKernel[k] /= sum;
    }
}


// Prints `what` plus the CUDA error description to stderr when `status`
// is not cudaSuccess. Returns true when the call succeeded.
static bool cudaCallSucceeded(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s (%s)\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Builds a Gaussian filter on the host, uploads it, and blurs one
// 1580 x 1050 16-bit image on the device.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    // Filter geometry: every size is derived from the radius so the host
    // array, the device buffer and the kernel argument cannot disagree.
    // NOTE: a radius of 4 truncates a sigma = 20 Gaussian very early (the
    // outermost taps still carry almost the full centre weight), so the
    // filter behaves almost like a 9x9 box filter.
    const int    radius      = 4;
    const int    filterWidth = 2 * radius + 1;
    const int    size        = filterWidth * filterWidth;
    const double sigma       = 20.0;

    const int    image_width  = 1580;
    const int    image_height = 1050;
    const size_t length       = static_cast<size_t>(image_width) * image_height;

    static const int BLOCK_WIDTH = 32;

    float           gKernel[size];
    float          *dev_p      = 0;
    unsigned short *d_img      = 0;
    unsigned short *d_blur_img = 0;

    createFilter(gKernel, sigma, radius);

    bool ok = cudaCallSucceeded(cudaMalloc((void**)&dev_p, size * sizeof(float)),
                                "cudaMalloc failed!");
    ok = ok && cudaCallSucceeded(cudaMemcpy(dev_p, gKernel, size * sizeof(float), cudaMemcpyHostToDevice),
                                 "cudaMemcpy failed!");
    ok = ok && cudaCallSucceeded(cudaMalloc((void**)&d_img, length * sizeof(unsigned short)),
                                 "cudaMalloc failed!");
    ok = ok && cudaCallSucceeded(cudaMalloc((void**)&d_blur_img, length * sizeof(unsigned short)),
                                 "cudaMalloc failed!");

    /* The image is read into an unsigned short host buffer and copied into
       d_img here; that code is omitted because it is large. */

    if (ok) {
        // Ceil-division so that partial tiles at the right/bottom edge are covered.
        const int x = (image_width  + BLOCK_WIDTH - 1) / BLOCK_WIDTH;
        const int y = (image_height + BLOCK_WIDTH - 1) / BLOCK_WIDTH;

        const dim3 grid (x, y, 1);                              // number of blocks
        const dim3 block(BLOCK_WIDTH, BLOCK_WIDTH, 1);

        gaussian_blur<<<grid,block>>>(d_blur_img, d_img, image_height, image_width, dev_p, filterWidth);

        // Launch-configuration errors are only visible through cudaGetLastError();
        // errors raised while the kernel runs surface at the synchronization.
        ok = cudaCallSucceeded(cudaGetLastError(), "gaussian_blur launch failed!");
        ok = ok && cudaCallSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");
    }

    /* After blurring, the result buffer is copied from device to host here. */

    // Pointers that were never allocated are still 0, which cudaFree accepts.
    cudaFree(d_img);
    cudaFree(d_blur_img);
    cudaFree(dev_p);

    return ok ? 0 : 1;
}

Answer 1

简短回答：两种算法都适用于图像模糊，因此请根据您的用例自行选择最好（最快）的那一种。

内核大小和sigma值直接相关：sigma越大，内核越大（因此得到最终结果所需的每像素运算也越多）。如果你实现的是朴素的二维卷积，那么你应该尝试可分离卷积的实现；它会将计算时间缩短一个数量级。

更深入地说：两者实现的高斯模糊操作几乎相同。为什么说“几乎”？这是因为对图像做FFT会隐式地将其周期化。因此，在图像的边界处，卷积核看到的是环绕到另一侧边缘的图像。这称为循环卷积（因为存在环绕）。另一方面，简单的高斯模糊实现的是线性卷积。

对于sigma = 20，简单高斯模糊和基于FFT的高斯模糊哪个更好？

1 个答案: