Question

我正在用CUDA编写卷积并使用OpenCV加载图像。但是我没有得到想要的卷积结果。

我将Mat更改为ptr，并访问CUDA内核中的像素值。我找不到错误。

#include<iostream>
#include<cstdio>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<cuda_runtime.h>
#include <device_launch_parameters.h>

#define _USE_MATH_DEFINES // for C++
#include <math.h>

using namespace std;

//typedef
typedef std::vector<float> Row; // One row of the matrix
typedef std::vector<Row> Matrix; // Matrix: a vector of rows

//function indentifier
Matrix createGk(float sig);

static inline void _safe_cuda_call(cudaError err, const char* msg, const char* file_name, const int line_number)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n", msg, file_name, line_number, cudaGetErrorString(err));
        std::cin.get();
        exit(EXIT_FAILURE);
    }
}

#define SAFE_CALL(call,msg) _safe_cuda_call((call),(msg),__FILE__,__LINE__)

下面是我的CUDA内核。

__global__ void image_filter(float* input, int input_width, int input_height,
    float* kernel, int half,
    float* output, int output_width, int output_height)
{
    //Set the padding size and filter size
    unsigned int paddingSize = half;
    unsigned int filterSize = 2 * half + 1;

    //Set the pixel coordinate
    const unsigned int j = blockIdx.x * blockDim.x + threadIdx.x + paddingSize; // paddingSize add sayar ma lo bu htin tl
    const unsigned int i = blockIdx.y * blockDim.y + threadIdx.y + paddingSize;

    //The multiply-add operation for the pixel coordinate (j, i)
    if (j >= paddingSize && j < input_width - paddingSize && i >= paddingSize && i < input_height - paddingSize) {

        unsigned int oPixelPos = (i - paddingSize) * output_width + (j - paddingSize);
        output[oPixelPos] = 0.0;
        for (int k = -half; k <= half; k++) {
            for (int l = -half; l <= half; l++) {
                unsigned int iPixelPos = (i + k) * input_width + (j + 1);
                unsigned int coefPos = (k + half) * filterSize + (l + half);
                output[oPixelPos] += input[iPixelPos] * kernel[coefPos];
            }
        }
    }
}

这是我用于CUDA内核的包装函数。

  void cuda_convolution(const cv::Mat& input,const cv::Mat& kernel, cv::Mat& output)
{
    //Calculate the total number of bytes of input and output images
    const int inBytes = input.step * input.rows;
    const int outBytes = output.step * output.rows;
    const int kernelBytes = kernel.step * kernel.rows;

    const int half = 45;
    float *d_input, *d_output, *d_kernel;

    //Allocate device memory
    SAFE_CALL(cudaMalloc<float>(&d_input, inBytes), "CUDA Malloc Failed"); 
    SAFE_CALL(cudaMalloc<float>(&d_output, outBytes), "CUDA Malloc Failed");
    SAFE_CALL(cudaMalloc<float>(&d_kernel, kernelBytes), "CUDA Malloc Failed");

    //Copy data from OpenCV input image to device memory
    SAFE_CALL(cudaMemcpy(d_input, input.ptr(), inBytes, cudaMemcpyHostToDevice), "CUDA Memcpy Host to Device Failed");
    SAFE_CALL(cudaMemcpy(d_input, kernel.ptr(), kernelBytes, cudaMemcpyHostToDevice), "CUDA Memcpy Host to Device Failed");

    //Specify a reasonable block size
    const dim3 block(32, 32);

    //Calculate the grid size to cover the whole image
    const dim3 grid((input.cols + block.x - 1) / block.x, (input.rows + block.y - 1) / block.y);

    image_filter << <grid, block >> > (d_input, input.cols, input.rows,
                                        d_kernel, half,
                                        d_output, output.cols, output.rows);

    //Synchronize to check for any kernel launch errors
    SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");

    //Copy back data from the device to OpenCV output image
    SAFE_CALL(cudaMemcpy(output.ptr(), d_output, outBytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Host To Device Failed");

    //Free the device memory
    SAFE_CALL(cudaFree(d_input), "CUDA Free Failed");
    SAFE_CALL(cudaFree(d_output), "CUDA Free Failed");
}

最后确定主要功能。

       int main(int argc, char** argv)
    {
        //std::string imagePath = "C:/Users/Kyi Myo Zaw/source/repos/cuda_test1/x64/Debug/image.jpg";
        //std::string imagePath = argv[1];//Not working with or without ""

        //Read input image from the disk
        cv::Mat input = cv::imread(argv[1], CV_LOAD_IMAGE_UNCHANGED);

        if (input.empty())
        {
            std::cout << "Image Not Found!" << std::endl;
            std::cin.get();
            return -1;
        }

        cv::Mat I_float;
        input.convertTo(I_float, CV_32FC3);
        std::vector<cv::Mat> ch;
        split(I_float, ch);

        cv::Mat Ib_float, Ig_float, Ir_float;
        Ib_float = ch[0];
        Ig_float = ch[1];
        Ir_float = ch[2];

        cv::Mat ouput(I_float.rows, I_float.cols, CV_32FC1);

        Matrix gk1 = createGk(15);
        cv::Mat gk1Mat((int)gk1.size(), (int)gk1[0].size(), CV_32F);
        for (int i = 0; i < gk1.size(); i++)
        {
            for (int j = 0; j < gk1[i].size(); j++)
            {
                gk1Mat.at<float>(i, j) = gk1[i][j];
            }
        }

            int top = 45;
        int bottom = 45;
        int left = 45;
        int right = 45;
            cv::Mat input_wr
        cv::copyMakeBorder(Ib_float, input_wr, top, bottom, left, right, CV_HAL_BORDER_CONSTANT, cv::Scalar(0, 0, 0));
        //Call the wrapper function
        cuda_convolution(input_wr, gk1Mat, ouput);



        //Call the wrapper function


        //Show the input and output
        cv::imshow("Input", input_wr);
        cv::imshow("Output", ouput);

        //Wait for key press
        cv::waitKey();

        return 0;
    }

   Matrix createGk(float sig)
    {

        float r = 0.0;
        float sum = 0.0;
        int f_size, half; // h == kernel half size, f_size == kernel size

        f_size = (int)ceil(sig * 6) + 1;
        cout << "f_size: " << f_size << endl;

        vector< vector<float> > gk;
        half = f_size / 2;


        for (int i = -half; i <= half; i++)
        {
            vector<float> row;
            for (int j = -half; j <= half; j++)
            {
                float value = 0.0;
                r = (i*i + j * j);
                value = exp((-(r)) / sig * sig);
                row.push_back(value);
                sum += value;
            }
            gk.push_back(row);
        }

        //For Normalization
        for (int i = 0; i < f_size; i++)
        {
            for (int j = 0; j < f_size; j++)
            {
                gk[i][j] /= sum;
            }

        }


        return gk;
    }

想要获得卷积图像。但是输出的图像不正确。

我的CUDA卷积无法产生预期的结果

0 个答案: