我在 Visual Studio 2010 中构建了一个项目,该项目使用 mexFunction 和一个调用 CUDA 内核函数的包装函数。我的问题是,当我尝试读取传递给包装函数的数据时,程序崩溃了。我在下面粘贴了一些代码,并在发生问题的确切位置加了注释。
// MEX gateway: validates two real, dense, single-precision inputs with the
// same number of elements, creates a rows-by-1 single-precision output in
// plhs[0], and passes the raw data pointers plus the dimensions of prhs[0]
// to myGPU::cudaFunction_wrapper.
//
//   nlhs / plhs : number of requested outputs / output array slots (at most 1)
//   nrhs / prhs : number of inputs / input arrays (exactly 2)
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
// input validation
if (nrhs != 2 || nlhs > 1) {
mexErrMsgTxt("Wrong number of input/output arguments.");
}
if (!mxIsSingle(prhs[0]) || !mxIsSingle(prhs[1])) {
mexErrMsgTxt("Inputs must be single arrays.");
}
if (mxIsComplex(prhs[0]) || mxIsComplex(prhs[1])) {
mexErrMsgTxt("Inputs must be real arrays.");
}
if (mxIsSparse(prhs[0]) || mxIsSparse(prhs[1])) {
mexErrMsgTxt("Inputs must be dense arrays.");
}
// Only the element counts are compared here; the two inputs may still have
// different shapes (e.g. 2x3 vs 3x2).
if (mxGetNumberOfElements(prhs[0]) != mxGetNumberOfElements(prhs[1])) {
mexErrMsgTxt("Inputs must have the same size.");
}
// create output array
// NOTE(review): numel, ndims and dims are not used in the visible part of
// this function; they appear to be leftovers from the commented-out
// mxCreateNumericArray call below.
mwSize numel = mxGetNumberOfElements(prhs[0]);
mwSize ndims = mxGetNumberOfDimensions(prhs[0]);
const mwSize *dims = mxGetDimensions(prhs[0]);
// NOTE(review): the dimensions are stored in int; confirm that inputs never
// exceed INT_MAX rows/columns, otherwise these conversions truncate.
int rows = mxGetM(prhs[0]); /* Get the dimensions of A */
int cols = mxGetN(prhs[0]);
//plhs[0] = mxCreateNumericArray(1, dims, mxSINGLE_CLASS, mxREAL);
//plhs[0] = mxCreateDoubleMatrix(rows,1,mxREAL);
// Create a rows-by-1 real single-precision matrix for the result
plhs[0] = mxCreateNumericMatrix(rows, 1, mxSINGLE_CLASS, mxREAL);
// get pointers to data
// h_c points into the output array; h_a and h_b point into the two inputs.
float *h_c = (float*) mxGetData(plhs[0]);
float *h_a = (float*) mxGetData(prhs[0]);
float *h_b = (float*) mxGetData(prhs[1]);
// Hand the host buffers to the CUDA wrapper; rows/cols describe prhs[0].
myGPU::cudaFunction_wrapper(h_a, h_b, h_c, rows, cols);
在 .cu 文件中,存在以下代码。
namespace myGPU
{//begin namespace

extern "C++" void cudaFunction_wrapper( float* h_A, float* h_B, float* h_C, int rows, int cols );

// Aborts with a diagnostic on stderr when a CUDA runtime call did not succeed.
// `what` is the message prefix; the CUDA error string is appended to it.
// NOTE(review): exit() terminates the whole host process. When this code is
// loaded as a MEX file that presumably means MATLAB itself; consider reporting
// the error to the caller instead once the interface allows it.
static void checkCuda( cudaError_t err, const char* what )
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Kernel: writes the last column of the column-major matrix A into C, i.e.
// C[m] = A[m + rows*(cols-1)] for every row m. This is the value the original
// nested m/n loop left in C[m]. B is accepted but not read.
//
// Launch layout: 1D grid of 1D blocks of any size. Each thread walks the rows
// with a stride of gridDim.x * blockDim.x, so the grid does not have to cover
// `rows` exactly. No shared memory is used.
// Preconditions: A holds rows*cols floats, C holds rows floats, both in
// device memory.
__global__ void cudaFunction( float* A, float* B, float* C, int rows, int cols )
{
    // Nothing to copy for an empty matrix (also keeps cols-1 non-negative).
    if (rows <= 0 || cols <= 0)
        return;

    // Offset of the first element of the last column; size_t avoids int
    // overflow for large matrices.
    const size_t lastColumn = (size_t)rows * (size_t)(cols - 1);
    const size_t rowCount   = (size_t)rows;
    const size_t stride     = (size_t)gridDim.x * blockDim.x;

    for (size_t m = (size_t)blockIdx.x * blockDim.x + threadIdx.x; m < rowCount; m += stride)
        C[m] = A[m + lastColumn];
}

// Host wrapper: copies h_A and h_B (rows*cols floats each) to the device,
// runs cudaFunction, and copies the rows-element result back into h_C.
//
//   h_A, h_B : host input buffers, rows*cols floats each
//   h_C      : host output buffer, rows floats
//   rows     : number of rows of the inputs (length of the output)
//   cols     : number of columns of the inputs
//
// When rows or cols is not positive the function returns immediately and
// leaves h_C untouched. Any CUDA failure prints a message and exits.
void cudaFunction_wrapper( float* h_A, float* h_B, float* h_C, int rows, int cols )
{
    // An empty input would otherwise produce a zero-block launch, which is an
    // invalid configuration and would abort the process.
    if (rows <= 0 || cols <= 0)
        return;

    // Compute byte counts in size_t so rows*cols cannot overflow int.
    const size_t inputBytes  = (size_t)rows * (size_t)cols * sizeof(float);
    const size_t outputBytes = (size_t)rows * sizeof(float);

    // Allocate the device input matrix A
    float *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, inputBytes), "Failed to allocate device vector A");

    // Allocate the device input matrix B
    float *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_B, inputBytes), "Failed to allocate device vector B");

    // Allocate the device output vector C (one float per row)
    float *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_C, outputBytes), "Failed to allocate device vector C");

    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, inputBytes, cudaMemcpyHostToDevice),
              "Failed to copy vector A from host to device");
    checkCuda(cudaMemcpy(d_B, h_B, inputBytes, cudaMemcpyHostToDevice),
              "Failed to copy vector B from host to device");

    // Launch configuration: one thread per output row. The block count is
    // capped because older devices limit the grid's x dimension to 65535;
    // the kernel's stride loop picks up any rows beyond the grid.
    const int threadsPerBlock = 256;
    const int maxBlocksPerGrid = 65535;
    int blocksPerGrid = (rows - 1) / threadsPerBlock + 1;   // ceil-div, no int overflow
    if (blocksPerGrid > maxBlocksPerGrid)
        blocksPerGrid = maxBlocksPerGrid;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    // Debugging note: h_A is host memory and can be read here directly, but
    // d_A, d_B and d_C are device pointers. Dereferencing them in host code
    // (e.g. mexPrintf("%f \n", d_B[k])) crashes; to inspect device data, copy
    // it back to a host buffer with cudaMemcpy first.

    cudaFunction<<<blocksPerGrid, threadsPerBlock>>>( d_A, d_B, d_C, rows, cols );
    checkCuda(cudaGetLastError(), "Failed to launch cudaFunction kernel");

    // Copy the device result vector in device memory to the host result vector
    // in host memory. This blocking copy also waits for the kernel to finish.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_C, d_C, outputBytes, cudaMemcpyDeviceToHost),
              "Failed to copy vector C from device to host");

    // Free device global memory
    checkCuda(cudaFree(d_A), "Failed to free device vector A");
    checkCuda(cudaFree(d_B), "Failed to free device vector B");
    checkCuda(cudaFree(d_C), "Failed to free device vector C");

    // Reset the device and exit
    // NOTE(review): cudaDeviceReset() destroys the process-wide CUDA context.
    // Confirm that nothing else in the host process relies on that context.
    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

}//end namespace
我想问的第二个问题是:如何在 Visual Studio 中通过 Nsight 附加到正在运行的进程来进行调试?我按照 http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Attach_CUDA_to_Process.htm 中的说明进行了操作,但“附加”按钮始终无法启用。顺便说一句,我想要附加的程序是 Matlab。
提前谢谢。
PS :Win 7 84x,CUDA SDK 5.5,Visual Studio 2010,Matlab 2011a
答案 0 :(得分:2)
对于 `d_C`,您是否应该使用 `cudaMalloc((void **)&d_C, rows*sizeof(float));` 来代替 `cudaMalloc((void **)&d_C, rows);`?
关于在 `mexPrintf("%f \n", d_B[m + rows*n])` 处发生的崩溃:问题是否在于 `d_B` 指向的是设备(GPU)内存?设备指针不能在主机代码中直接解引用,需要先用 `cudaMemcpy` 把数据拷贝回主机内存再打印。
此外,正如 Praetorian 所说,所创建的 `plhs[0]` 对于 `h_c` 所需的缓冲区来说太小了。