我正在用CUDA编写卷积并使用OpenCV加载图像。但是我没有得到想要的卷积结果。
我将Mat更改为ptr,并访问CUDA内核中的像素值。我找不到错误。
#include<iostream>
#include<cstdio>
#include<opencv2/core/core.hpp>
#include<opencv2/highgui/highgui.hpp>
#include<cuda_runtime.h>
#include <device_launch_parameters.h>
#define _USE_MATH_DEFINES // for C++
#include <math.h>
using namespace std;
//typedef
typedef std::vector<float> Row; // One row of the matrix
typedef std::vector<Row> Matrix; // Matrix: a vector of rows
//function indentifier
Matrix createGk(float sig);
static inline void _safe_cuda_call(cudaError err, const char* msg, const char* file_name, const int line_number)
{
if (err != cudaSuccess)
{
fprintf(stderr, "%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n", msg, file_name, line_number, cudaGetErrorString(err));
std::cin.get();
exit(EXIT_FAILURE);
}
}
#define SAFE_CALL(call,msg) _safe_cuda_call((call),(msg),__FILE__,__LINE__)
下面是我的CUDA内核。
__global__ void image_filter(float* input, int input_width, int input_height,
float* kernel, int half,
float* output, int output_width, int output_height)
{
//Set the padding size and filter size
unsigned int paddingSize = half;
unsigned int filterSize = 2 * half + 1;
//Set the pixel coordinate
const unsigned int j = blockIdx.x * blockDim.x + threadIdx.x + paddingSize; // paddingSize add sayar ma lo bu htin tl
const unsigned int i = blockIdx.y * blockDim.y + threadIdx.y + paddingSize;
//The multiply-add operation for the pixel coordinate (j, i)
if (j >= paddingSize && j < input_width - paddingSize && i >= paddingSize && i < input_height - paddingSize) {
unsigned int oPixelPos = (i - paddingSize) * output_width + (j - paddingSize);
output[oPixelPos] = 0.0;
for (int k = -half; k <= half; k++) {
for (int l = -half; l <= half; l++) {
unsigned int iPixelPos = (i + k) * input_width + (j + 1);
unsigned int coefPos = (k + half) * filterSize + (l + half);
output[oPixelPos] += input[iPixelPos] * kernel[coefPos];
}
}
}
}
这是我用于CUDA内核的包装函数。
void cuda_convolution(const cv::Mat& input,const cv::Mat& kernel, cv::Mat& output)
{
//Calculate the total number of bytes of input and output images
const int inBytes = input.step * input.rows;
const int outBytes = output.step * output.rows;
const int kernelBytes = kernel.step * kernel.rows;
const int half = 45;
float *d_input, *d_output, *d_kernel;
//Allocate device memory
SAFE_CALL(cudaMalloc<float>(&d_input, inBytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc<float>(&d_output, outBytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc<float>(&d_kernel, kernelBytes), "CUDA Malloc Failed");
//Copy data from OpenCV input image to device memory
SAFE_CALL(cudaMemcpy(d_input, input.ptr(), inBytes, cudaMemcpyHostToDevice), "CUDA Memcpy Host to Device Failed");
SAFE_CALL(cudaMemcpy(d_input, kernel.ptr(), kernelBytes, cudaMemcpyHostToDevice), "CUDA Memcpy Host to Device Failed");
//Specify a reasonable block size
const dim3 block(32, 32);
//Calculate the grid size to cover the whole image
const dim3 grid((input.cols + block.x - 1) / block.x, (input.rows + block.y - 1) / block.y);
image_filter << <grid, block >> > (d_input, input.cols, input.rows,
d_kernel, half,
d_output, output.cols, output.rows);
//Synchronize to check for any kernel launch errors
SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
//Copy back data from the device to OpenCV output image
SAFE_CALL(cudaMemcpy(output.ptr(), d_output, outBytes, cudaMemcpyDeviceToHost), "CUDA Memcpy Host To Device Failed");
//Free the device memory
SAFE_CALL(cudaFree(d_input), "CUDA Free Failed");
SAFE_CALL(cudaFree(d_output), "CUDA Free Failed");
}
最后确定主要功能。
int main(int argc, char** argv)
{
//std::string imagePath = "C:/Users/Kyi Myo Zaw/source/repos/cuda_test1/x64/Debug/image.jpg";
//std::string imagePath = argv[1];//Not working with or without ""
//Read input image from the disk
cv::Mat input = cv::imread(argv[1], CV_LOAD_IMAGE_UNCHANGED);
if (input.empty())
{
std::cout << "Image Not Found!" << std::endl;
std::cin.get();
return -1;
}
cv::Mat I_float;
input.convertTo(I_float, CV_32FC3);
std::vector<cv::Mat> ch;
split(I_float, ch);
cv::Mat Ib_float, Ig_float, Ir_float;
Ib_float = ch[0];
Ig_float = ch[1];
Ir_float = ch[2];
cv::Mat ouput(I_float.rows, I_float.cols, CV_32FC1);
Matrix gk1 = createGk(15);
cv::Mat gk1Mat((int)gk1.size(), (int)gk1[0].size(), CV_32F);
for (int i = 0; i < gk1.size(); i++)
{
for (int j = 0; j < gk1[i].size(); j++)
{
gk1Mat.at<float>(i, j) = gk1[i][j];
}
}
int top = 45;
int bottom = 45;
int left = 45;
int right = 45;
cv::Mat input_wr
cv::copyMakeBorder(Ib_float, input_wr, top, bottom, left, right, CV_HAL_BORDER_CONSTANT, cv::Scalar(0, 0, 0));
//Call the wrapper function
cuda_convolution(input_wr, gk1Mat, ouput);
//Call the wrapper function
//Show the input and output
cv::imshow("Input", input_wr);
cv::imshow("Output", ouput);
//Wait for key press
cv::waitKey();
return 0;
}
Matrix createGk(float sig)
{
float r = 0.0;
float sum = 0.0;
int f_size, half; // h == kernel half size, f_size == kernel size
f_size = (int)ceil(sig * 6) + 1;
cout << "f_size: " << f_size << endl;
vector< vector<float> > gk;
half = f_size / 2;
for (int i = -half; i <= half; i++)
{
vector<float> row;
for (int j = -half; j <= half; j++)
{
float value = 0.0;
r = (i*i + j * j);
value = exp((-(r)) / sig * sig);
row.push_back(value);
sum += value;
}
gk.push_back(row);
}
//For Normalization
for (int i = 0; i < f_size; i++)
{
for (int j = 0; j < f_size; j++)
{
gk[i][j] /= sum;
}
}
return gk;
}
想要获得卷积图像。但是输出的图像不正确。