我开始使用cuda实现一些简单的图像处理,但我的代码中有错误 将像素从设备复制到主机
时会发生错误这是我的尝试
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2\core\core.hpp>
#include <opencv2\highgui\highgui.hpp>
#include <stdio.h>
using namespace cv;
unsigned char *h_pixels;
unsigned char *d_pixels;
int bufferSize;
int width,height;
const int BLOCK_SIZE = 32;
Mat image;
void get_pixels(const char* fileName)
{
image = imread(fileName);
bufferSize = image.size().width * image.size().height * 3 * sizeof(unsigned char);
width = image.size().width;
height = image.size().height;
h_pixels = new unsigned char[bufferSize];
memcpy(h_pixels,image.data,bufferSize);
}
__global__ void invert_image(unsigned char* pixels,int width,int height)
{
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int cidx = (row * width + col) * 3;
pixels[cidx] = 255 - pixels[cidx];
pixels[cidx + 1] = 255 - pixels[cidx + 1];
pixels[cidx + 2] = 255 - pixels[cidx + 2];
}
int main()
{
get_pixels("D:\\photos\\z.jpg");
cudaError_t err = cudaMalloc((void**)&d_pixels,bufferSize);
err = cudaMemcpy(d_pixels,h_pixels,bufferSize,cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 dimGrid(width/dimBlock.x,height/dimBlock.y);
invert_image<<<dimBlock,dimGrid>>>(d_pixels,width,height);
unsigned char *pixels = new unsigned char[bufferSize];
err= cudaMemcpy(pixels,d_pixels,bufferSize,cudaMemcpyDeviceToHost);// unknown error
const char * errStr = cudaGetErrorString(err);
cudaFree(d_pixels);
image.data = pixels;
namedWindow("display image");
imshow("display image",image);
waitKey();
return 0;
}
我怎样才能找出cuda设备中发生的错误 谢谢你的帮助
答案 0 :(得分:2)
答案 1 :(得分:2)
OpenCV图像不连续。每行是4字节或8字节对齐。您还应该将Mat的step
字段传递给CUDA内核,以便正确计算cidx
。计算输出索引的通用公式为:
cidx = row * (step/elementSize) + (NumberOfChannels * col);
在你的情况下,它将是:
cidx = row * step + (3 * col);
参考图像的对齐方式,缓冲区大小等于image.step * image.size().height
。
接下来是@phoad在第三点指出的那个。您应该创建足够数量的线程块来覆盖整个图像。
以下是Grid的通用公式,可为任何图像大小创建足够数量的块。
dim3 block(BLOCK_SIZE,BLOCK_SIZE);
dim3 grid((width + block.x - 1)/block.x,(height + block.y - 1)/block.y);
答案 2 :(得分:0)
在内核调用后立即使用此命令来打印内核错误:
printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()))