cublas cublasDgetrfBatched() batched LU decomposition does not work for matrices larger than 32x32

Time: 2014-01-11 19:13:16

Tags: matrix cuda matlab cublas

I have written a CUDA function for Matlab that uses cublasDgetrfBatched() to perform LU decomposition on a batch of matrices. The toolkit documentation for this function is here.

It works for matrices of size 32x32, but for larger matrices it fails with status code CUBLAS_STATUS_INVALID_VALUE. Here is my source code (gpuBatchedLU.cu):

#include "mex.h"
#include "gpu/mxGPUArray.h"

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <string>
#include <sstream>

static std::string cublasGetErrorString(cublasStatus_t error) {
 switch (error) {
 case CUBLAS_STATUS_SUCCESS:
  return "CUBLAS_STATUS_SUCCESS";

 case CUBLAS_STATUS_NOT_INITIALIZED:
  return "CUBLAS_STATUS_NOT_INITIALIZED";

 case CUBLAS_STATUS_ALLOC_FAILED:
  return "CUBLAS_STATUS_ALLOC_FAILED";

 case CUBLAS_STATUS_INVALID_VALUE:
  return "CUBLAS_STATUS_INVALID_VALUE";

 case CUBLAS_STATUS_ARCH_MISMATCH:
  return "CUBLAS_STATUS_ARCH_MISMATCH";

 case CUBLAS_STATUS_MAPPING_ERROR:
  return "CUBLAS_STATUS_MAPPING_ERROR";

 case CUBLAS_STATUS_EXECUTION_FAILED:
  return "CUBLAS_STATUS_EXECUTION_FAILED";

 case CUBLAS_STATUS_INTERNAL_ERROR:
  return "CUBLAS_STATUS_INTERNAL_ERROR";
 }

 return "<unknown>";
}

inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
 if (code != CUBLAS_STATUS_SUCCESS) {
  std::stringstream ss;
  ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == CUBLAS_STATUS_SUCCESS;
}

inline bool cudaAssert(cudaError_t code, const char* file, int line) {
 if (code != cudaSuccess) {
  std::stringstream ss;
  ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == cudaSuccess;
}

inline bool mexGPUAssert(int code, const char* file, int line) {
 if (code != MX_GPU_SUCCESS) {
  std::stringstream ss;
  ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
    << std::string(file) << ", line " << line << ".";
  mexErrMsgTxt(ss.str().c_str());
 }

 return code == MX_GPU_SUCCESS;
}

#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }

void mexFunction(int nlhs, mxArray *plhs[], /* Output variables */int nrhs,
  const mxArray *prhs[]) /* Input variables */{
 if (nrhs != 1) { /* require exactly one input argument */
  mexErrMsgTxt("This function requires one input argument.");
  return;
 }

 if (nlhs > 3) { /* at most three outputs */
  mexErrMsgTxt("This function takes a maximum of three output variables.");
  return;
 }

 mxGPUErrchk(mxInitGPU());

 const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
 size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
 const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);

 if (ndims != 3) { /* the input must be a 3-dimensional array (a batch of matrices) */
  mexErrMsgTxt("The input argument must be a 3-dimensional array.");
  return;
 }

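 /* Create a cuBLAS context; the batched call below goes through this handle. */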
 cublasHandle_t handle;

 cublasErrchk(cublasCreate(&handle));

 int no_matrices = dim[2];
 int nrow = dim[0];
 int ncol = dim[1];
 int matrix_size = nrow * ncol;
 size_t i;

 std::stringstream ss;
 ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
 mexPrintf(ss.str().c_str());

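 /* Make a writable GPU copy of the input batch; the factorization overwrites it in place. */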
 mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
 double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);

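 /* Per-matrix factorization status returned by cuBLAS (0 = success). */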
 size_t info_dimensions[1] = { no_matrices };
 mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_INITIALIZE_VALUES);
 int* out_info = (int*) mxGPUGetData(gpu_array_info);

 mexPrintf("after defining gpu_array_info\n");

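 /* Pivot indices produced by the factorization, nrow entries per matrix. */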
 size_t pivot_dimensions[2] = { nrow, no_matrices };
 mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
   MX_GPU_DO_NOT_INITIALIZE);
 int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);

 mexPrintf("after defining gpu_array_pivot\n");

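 /* cublasDgetrfBatched() expects a device array of device pointers, one per matrix in the batch. */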
 double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
 for (i = 0; i < no_matrices; i++) {
  inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
 }
 double** inout_pointers_GPU;
 cudaErrchk(cudaMalloc((void** )&inout_pointers_GPU, no_matrices * sizeof(double*)));
 cudaErrchk(
   cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
 free(inout_pointers_CPU);

 ss.str(""); /* reset the buffered text; clear() alone only resets the error flags */
 ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow << "\nno_matrices = " << no_matrices << std::endl;
 mexPrintf(ss.str().c_str());

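 /* In-place batched LU factorization with partial pivoting. */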
 cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));

 cublasErrchk(cublasDestroy(handle));

 cudaErrchk(cudaFree(inout_pointers_GPU));

 if (mxIsGPUArray(prhs[0])) {
  plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
   }
  }
 } else {
  plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
  if (nlhs > 1) {
   plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
   if (nlhs > 2) {
    plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
   }
  }
 }

 mxGPUDestroyGPUArray(gpu_array_inout);
 mxGPUDestroyGPUArray(gpu_array_pivot);
 mxGPUDestroyGPUArray(gpu_array_info);
 mxFree((void*) dim);

 return;
}

I compile it as follows:

mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu

I call it from MATLAB:

[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE

I am using Matlab R2013b with the Parallel Computing Toolbox, CUDA 5.5, and an NVS 5200M graphics chip.

Can anyone reproduce this problem? I would appreciate any suggestions on how to resolve it.

1 Answer:

Answer 0 (score: 1)

The problem seems to be that Matlab R2013b uses the libcublas.so from CUDA 5.0. The library link is located in /MATLAB/R2013b/bin/glnxa64/. Once I changed the link to point to the libcublas.so of my CUDA 5.5 installation, it worked fine.
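
For reference, a rough sketch of the kind of shell commands involved; the exact library file names are assumptions (they may be versioned, e.g. libcublas.so.5.x), so inspect and back up MATLAB's originals first and adjust the paths to your installation:

ls -l /MATLAB/R2013b/bin/glnxa64/libcublas.so*   # see which CUBLAS library MATLAB currently ships
ln -sf /usr/local/cuda/lib64/libcublas.so /MATLAB/R2013b/bin/glnxa64/libcublas.so   # repoint the link to the CUDA 5.5 library
LD_PRELOAD=/usr/local/cuda/lib64/libcublas.so matlab   # alternative: preload the CUDA 5.5 library without touching MATLAB's files

The LD_PRELOAD route is the less invasive option, since it leaves MATLAB's bundled libraries untouched.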