我正在尝试在cuda中实现Sauvola二值化。为此我已经在主机中的二维数组中读取图像并使用pitch在设备中为二维数组分配内存。在分配内存之后我试图复制主机二维数组使用cudaMemcpy2D设备2d数组,它编译得很好,但它在运行时崩溃。我无法理解我在哪里丢失,请提出一些建议。我写的代码如下:
#include "BinMain.h"
#include "Binarization.h"
#include <stdlib.h>
#include <stdio.h>
#include <conio.h>
#include <cuda.h>
#include <cuda_runtime.h>
void printDevProp(cudaDeviceProp);
void CUDA_SAFE_CALL( cudaError_t);
int main()
{
//Read an IplImage in imgOriginal as grayscale
IplImage * imgOriginal = cvLoadImage("E:\\1.tiff",CV_LOAD_IMAGE_GRAYSCALE);
//Create a size variable of type CvSize for cvCreateImage Parameter
CvSize size = cvSize(imgOriginal->width,imgOriginal->height);
//create an image for storing the result image with same height and width as imgOriginal
IplImage * imgResult = cvCreateImage(size,imgOriginal->depth,imgOriginal- >nChannels);
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a 2D array for storing the returned device array
int ** arrReturn = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrReturn[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
//Create a CvScalar variable to copy pixel values in 2D array (arrOriginal)
CvScalar s;
//Copying the pixl values
for(int j = 0;j<imgOriginal->height;j++)
{
for(int k =0;k<imgOriginal->width;k++)
{
s = cvGet2D(imgOriginal,j,k);
arrOriginal[j][k] = s.val[0];
}
}
//Cuda Device Property
int devCount;
cudaGetDeviceCount(&devCount);
printf("CUDA Device Query...\n");
printf("There are %d CUDA devices.\n", devCount);
// Iterate through devices
for (int i = 0; i < devCount; ++i)
{
// Get device properties
printf("\nCUDA Device #%d\n", i);
cudaDeviceProp devProp;
cudaGetDeviceProperties(&devProp, i);
printDevProp(devProp);
}
//Start the clock
clock_t start = clock();
//Allocating Device memory for 2D array using pitch
size_t host_orig_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host original array pitch in bytes
size_t dev_pitch; //device array pitch in bytes which will be used in cudaMallocPitch
size_t dev_pitchReturn; //device return array pitch in bytes
size_t host_ret_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host return array pitch in bytes
int * devArrOriginal; //device 2d array of original image
int * result; //device 2d array for returned array
int dynmicRange = 128; //Dynamic Range for calculating the threshold from sauvola's formula
//Allocating memory by using cudaMallocPitch
CUDA_SAFE_CALL(cudaMallocPitch((void**)&devArrOriginal,&dev_pitch,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Allocating memory for returned array
CUDA_SAFE_CALL(cudaMallocPitch((void**)&result,&dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));
//Copying 2D array from host memory to device mempry by using cudaMemCpy2D
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
int windowSize = 19; //Size of the window for calculating mean and variance
//Launching the kernel by calling myKernelLauncher function.
myKernelLauncher(devArrOriginal,result,windowSize,imgOriginal->width,imgOriginal- >height,dev_pitch,dynmicRange);
//Calling the sauvola binarization function by passing the parameters as
//1.arrOriginal 2D array 2.Original image height 3.Original image width
//int ** result = AdaptiveBinarization(arrOriginal,imgOriginal->height,imgOriginal- >width);//binarization(arrOriginal,imgOriginal->width,imgOriginal->height);
//
CUDA_SAFE_CALL(cudaMemcpy2D(arrReturn,host_ret_pitch,result,dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int),cudaMemcpyDeviceToHost));
//create a CvScalar variable to set the data in imgResult
CvScalar ss;
//Copy the pixel values from returned array to imgResult
for(int i=0;i<imgOriginal->height;i++)
{
for(int j=0;j<imgOriginal->width;j++)
{
ss = cvScalar(arrReturn[i][j]*255);
cvSet2D(imgResult,i,j,ss);
//k++; //No need for k if returned array is 2D
}
}
printf("Done \n");
//calculate and print the time elapsed
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
//Create a windoe and show the resule image
cvNamedWindow("Result",CV_WINDOW_AUTOSIZE);
cvShowImage("Result",imgResult);
cvWaitKey(0);
getch();
//Release the various resources
cvReleaseImage(&imgResult);
cvReleaseImage(&imgOriginal);
cvDestroyWindow("Result");
for(int i = 0; i < imgOriginal->height; i++)
free(arrOriginal[i]);
free(arrOriginal);
free(result);
cudaFree(&devArrOriginal);
cudaFree(&result);
}
// Print device properties
void printDevProp(cudaDeviceProp devProp)
{
printf("Major revision number: %d\n", devProp.major);
printf("Minor revision number: %d\n", devProp.minor);
printf("Name: %s\n", devProp.name);
printf("Total global memory: %u\n", devProp.totalGlobalMem);
printf("Total shared memory per block: %u\n", devProp.sharedMemPerBlock);
printf("Total registers per block: %d\n", devProp.regsPerBlock);
printf("Warp size: %d\n", devProp.warpSize);
printf("Maximum memory pitch: %u\n", devProp.memPitch);
printf("Maximum threads per block: %d\n", devProp.maxThreadsPerBlock);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of block: %d\n", i, devProp.maxThreadsDim[i]);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of grid: %d\n", i, devProp.maxGridSize[i]);
printf("Clock rate: %d\n", devProp.clockRate);
printf("Total constant memory: %u\n", devProp.totalConstMem);
printf("Texture alignment: %u\n", devProp.textureAlignment);
printf("Concurrent copy and execution: %s\n", (devProp.deviceOverlap ? "Yes" : "No"));
printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
printf("Kernel execution timeout: %s\n", (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
return;
}
/* Utility Macro : CUDA SAFE CALL */
void CUDA_SAFE_CALL( cudaError_t call)
{
cudaError_t ret = call;
switch(ret)
{
case cudaSuccess:
break;
default :
{
printf(" ERROR at line :%i.%d' ' %s\n",
__LINE__,ret,cudaGetErrorString(ret));
exit(-1);
break;
}
}
}
代码流程如下: 1.在图像主机中创建一个2D数组,从内核返回另一个数组。 2.使用CudaMallocPitch在设备中为2D阵列分配内存 3.为内核返回的二维数组分配内存。 4.使用cudaMemcpy2d将原始2d阵列从主机复制到设备阵列。 5.启动内核。 6.使用cudaMemcpy2D将返回的设备阵列复制到主机阵列。
程序在达到第4点时崩溃。这是一个未处理的异常,表示“在SauvolaBinarization_CUDA_OpenCV.exe中0x773415de处的未处理异常:0xC0000005:访问冲突读取位置0x01611778。”
我认为问题必须是在分配内存时,但我是第一次使用该功能,并且不知道它是如何工作的,请建议。
答案 0 :(得分:2)
首先,你没有正确地调用“cudaMallocPitch”。 “height”参数应表示行数,因此不是:
imgOriginal->height * sizeof(int)
你应该简单地使用:
imgOriginal->height
这很好,因为每行的字节数已经包含在“pitch”属性中。然而,主要问题在于为主机映像分配内存的方式。当你写:
//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++)
{
arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}
您正在有效地创建一个包含指向数组的指针的数组。 CUDA API调用你 正在制作:
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
期望输入内存缓冲区是连续的。所以这就是将要发生的事情:输入图像的第一行(总计“imgOriginal-&gt; width * sizeof(float)”字节)将从地址开始读取:
(void*)arrOriginal
但是,从该地址开始的有效数据量仅为“imgOriginal-&gt; height * sizeof(int *)”字节。两个字节计数很可能不同,这将导致崩溃,因为您最终将从未知位置读取。
要解决此问题,请考虑将“arrOriginal”分配为一个连续的块,例如:
int * arrOriginal = (int *)malloc(imgOriginal->height * imgOriginal->width * sizeof(int));
此外,在这种情况下,你的音调应该是:
"imgOriginal->width * sizeof(int)"