我为Matlab写了一个CUDA MEX函数,使用cublasDgetrfBatched()对一批矩阵进行LU分解。该函数的说明见CUDA工具包的cuBLAS文档。
它对尺寸为32x32的矩阵工作正常;但是对于更大的矩阵,它会失败并返回状态码CUBLAS_STATUS_INVALID_VALUE。以下是我的源代码(gpuBatchedLU.cu):
#include "mex.h"
#include "gpu/mxGPUArray.h"
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <string>
#include <sstream>
/*
 * Map a cuBLAS status code to the spelling of its enumerator.
 *
 * error: status value returned by a cuBLAS call.
 * Returns the enumerator name as a string, or "<unknown>" for any value
 * that is not one of the eight codes listed below.
 */
static std::string cublasGetErrorString(cublasStatus_t error) {
    const char* name = "<unknown>";
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        name = "CUBLAS_STATUS_SUCCESS";
        break;
    case CUBLAS_STATUS_NOT_INITIALIZED:
        name = "CUBLAS_STATUS_NOT_INITIALIZED";
        break;
    case CUBLAS_STATUS_ALLOC_FAILED:
        name = "CUBLAS_STATUS_ALLOC_FAILED";
        break;
    case CUBLAS_STATUS_INVALID_VALUE:
        name = "CUBLAS_STATUS_INVALID_VALUE";
        break;
    case CUBLAS_STATUS_ARCH_MISMATCH:
        name = "CUBLAS_STATUS_ARCH_MISMATCH";
        break;
    case CUBLAS_STATUS_MAPPING_ERROR:
        name = "CUBLAS_STATUS_MAPPING_ERROR";
        break;
    case CUBLAS_STATUS_EXECUTION_FAILED:
        name = "CUBLAS_STATUS_EXECUTION_FAILED";
        break;
    case CUBLAS_STATUS_INTERNAL_ERROR:
        name = "CUBLAS_STATUS_INTERNAL_ERROR";
        break;
    default:
        /* Unlisted status value: keep the "<unknown>" placeholder. */
        break;
    }
    return std::string(name);
}
/*
 * Check the status of a cuBLAS call and report a failure to MATLAB.
 *
 * code: status returned by the cuBLAS call.
 * file: source file name to include in the message (normally __FILE__).
 * line: source line to include in the message (normally __LINE__).
 *
 * On failure the message is handed to mexErrMsgTxt.
 * NOTE(review): mexErrMsgTxt is expected to abort the MEX call, in which
 * case the `false` return is never observed -- confirm against the MEX API.
 * Returns true when code is CUBLAS_STATUS_SUCCESS.
 */
inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
    const bool succeeded = (code == CUBLAS_STATUS_SUCCESS);
    if (!succeeded) {
        std::stringstream message;
        message << "cublasAssert: " << cublasGetErrorString(code);
        message << " in " << std::string(file);
        message << ", line " << line << ".";
        mexErrMsgTxt(message.str().c_str());
    }
    return succeeded;
}
/*
 * Check the status of a CUDA runtime call and report a failure to MATLAB.
 *
 * code: error code returned by the CUDA runtime call.
 * file: source file name to include in the message (normally __FILE__).
 * line: source line to include in the message (normally __LINE__).
 *
 * On failure the text from cudaGetErrorString plus the location is handed
 * to mexErrMsgTxt.
 * Returns true when code is cudaSuccess.
 */
inline bool cudaAssert(cudaError_t code, const char* file, int line) {
    const bool succeeded = (code == cudaSuccess);
    if (!succeeded) {
        std::stringstream message;
        message << "cudaAssert: " << cudaGetErrorString(code);
        message << " in " << std::string(file);
        message << ", line " << line << ".";
        mexErrMsgTxt(message.str().c_str());
    }
    return succeeded;
}
/*
 * Check the result of initializing the MathWorks GPU API (this file passes
 * the return value of mxInitGPU) and report a failure to MATLAB.
 *
 * code: value to compare against MX_GPU_SUCCESS.
 * file: source file name to include in the message (normally __FILE__).
 * line: source line to include in the message (normally __LINE__).
 *
 * On failure a fixed message plus the location is handed to mexErrMsgTxt.
 * Returns true when code equals MX_GPU_SUCCESS.
 */
inline bool mexGPUAssert(int code, const char* file, int line) {
    const bool succeeded = (code == MX_GPU_SUCCESS);
    if (!succeeded) {
        std::stringstream message;
        message << "mexGPUAssert: could not initialize the Mathworks GPU API in ";
        message << std::string(file);
        message << ", line " << line << ".";
        mexErrMsgTxt(message.str().c_str());
    }
    return succeeded;
}
/*
 * Status-checking wrappers: evaluate `ans` once and pass it, together with
 * the call site's file and line, to the matching *Assert function.
 *
 * The bodies are wrapped in do { ... } while (0) so that `macro(x);` is a
 * single statement. A bare `{ ... }` body followed by `;` breaks
 * `if (c) macro(x); else ...`.
 */
#define cublasErrchk(ans) do { cublasAssert((ans), __FILE__, __LINE__); } while (0)
#define cudaErrchk(ans) do { cudaAssert((ans), __FILE__, __LINE__); } while (0)
#define mxGPUErrchk(ans) do { mexGPUAssert((ans), __FILE__, __LINE__); } while (0)
void mexFunction(int nlhs, mxArray *plhs[], /* Output variables */int nrhs,
const mxArray *prhs[]) /* Input variables */{
if (nrhs != 1) { /* end if not one function arguments */
mexErrMsgTxt("This function requires one input argument.");
return;
}
if (nlhs > 3) { /* take three outputs */
mexErrMsgTxt("This function takes a maximum of three output variables.");
return;
}
mxGPUErrchk(mxInitGPU());
const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);
if (ndims != 3) { /* end if input arguments are of different dimensions */
mexErrMsgTxt("The input argument must be a 3-dimensional array.");
return;
}
cublasHandle_t handle;
cublasErrchk(cublasCreate(&handle));
int no_matrices = dim[2];
int nrow = dim[0];
int ncol = dim[1];
int matrix_size = nrow * ncol;
size_t i;
std::stringstream ss;
ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
mexPrintf(ss.str().c_str());
mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);
size_t info_dimensions[1] = { no_matrices };
mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
MX_GPU_INITIALIZE_VALUES);
int* out_info = (int*) mxGPUGetData(gpu_array_info);
mexPrintf("after defining gpu_array_info\n");
size_t pivot_dimensions[2] = { nrow, no_matrices };
mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
MX_GPU_DO_NOT_INITIALIZE);
int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);
mexPrintf("after defining gpu_array_pivot\n");
double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
for (i = 0; i < no_matrices; i++) {
inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
}
double** inout_pointers_GPU;
cudaErrchk(cudaMalloc((void** )&inout_pointers_GPU, no_matrices * sizeof(double*)));
cudaErrchk(
cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
free(inout_pointers_CPU);
ss.clear();
ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow << "\nno_matrices = " << no_matrices << std::endl;
mexPrintf(ss.str().c_str());
cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));
cublasErrchk(cublasDestroy(handle));
cudaErrchk(cudaFree(inout_pointers_GPU));
if (mxIsGPUArray(prhs[0])) {
plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
if (nlhs > 1) {
plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
if (nlhs > 2) {
plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
}
}
} else {
plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
if (nlhs > 1) {
plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
if (nlhs > 2) {
plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
}
}
}
mxGPUDestroyGPUArray(gpu_array_inout);
mxGPUDestroyGPUArray(gpu_array_pivot);
mxGPUDestroyGPUArray(gpu_array_info);
mxFree((void*) dim);
return;
}
我编译如下:
mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu
我在MATLAB中这样调用:
[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE
我使用带并行计算工具箱的Matlab R2013b、Cuda 5.5以及NVS 5200M图形芯片。
有人能复现这个问题吗?如果有任何关于如何解决这个问题的建议,我将不胜感激。
答案 0 :(得分:1)
问题似乎在于Matlab R2013b使用的是5.0版本的libcublas.so。
该文件的符号链接位于/MATLAB/R2013b/bin/glnxa64/目录下。
一旦我把该链接改为指向我的Cuda 5.5安装中的libcublas.so,
它就可以正常工作了。