CUDA: Using the CUSPARSE csrmv() routine

Date: 2016-07-16 14:38:14

Tags: c++ cuda nvidia allocation

I am currently trying to accelerate an HPCG implementation with the CUSPARSE library, but I seem to have made some mistake during device data allocation.

This is the code segment that causes CUSPARSE_STATUS_MAPPING_ERROR:

int HPC_sparsemv( CRS_Matrix *A_crs_d, 
      FP * x_d, FP * y_d)
{
FP alpha = 1.0f;
FP beta = 0.0f;

FP* vals = A_crs_d->vals;
int* inds = A_crs_d->col_ind;
int* row_ptr = A_crs_d->row_ptr;

/*generate Matrix descriptor for SparseMV computation*/
cusparseMatDescr_t matDescr;
cusparseCreateMatDescr(&matDescr);

cusparseStatus_t status;

/*hand off control to CUSPARSE routine*/

#ifdef DOUBLE

status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
    A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr, 
    inds, x_d, &beta, y_d); 


#else

status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows, 
            A_crs_d->ncols,A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
            col_ind, x_d, &beta, y_d); 

#endif

Note: FP is a typedef wrapped in conditional-compilation guards, which means it resolves at compile time to an alias for either float or double.
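
For clarity, the guard is equivalent to something along these lines (a simplified sketch, not the exact code from my project):

/* Simplified sketch of the FP typedef guard described above;
   the actual guard in the project may differ slightly. */
#ifdef DOUBLE
typedef double FP;   /* double-precision build */
#else
typedef float FP;    /* single-precision build */
#endif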

This is the function that handles the data allocation:

int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP* b_d, const FP* const b, FP * x_d, FP * const x, 
    struct CRS_Matrix* A_crs_d, int nrows, int ncols, int nnz){

std::cout << "Beginning device allocation..." << std::endl; 

int size_r = nrows * sizeof(FP);
int size_c = ncols * sizeof(FP);
int size_nnz = nnz * sizeof(FP);

int allocStatus = 0;

/*device alloc r_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );



/*device alloc p_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );


/*device alloc Ap_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );


/*device alloc b_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &b_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(b_d, b, size_r, cudaMemcpyHostToDevice));

/*device alloc x_d*/
allocStatus |= (int) checkCuda( cudaMalloc((void **) &x_d, size_r ) );
allocStatus |= (int) checkCuda( cudaMemcpy(x_d, x, size_r, cudaMemcpyHostToDevice));


/*device alloc A_crs_d*/
FP * valtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );


int * indtmp;
allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz* sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind, 
nnz * sizeof(int) , cudaMemcpyHostToDevice) );


int * rowtmp; 
allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp,  (nrows + 1) * sizeof(int)) );
allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr, 
(nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );


allocStatus |= (int) checkCuda( cudaMallocHost( &A_crs_d, sizeof(CRS_Matrix)) );

A_crs_d->vals = valtmp;
A_crs_d->col_ind = indtmp;
A_crs_d->row_ptr = rowtmp;

A_crs_d->nrows = CRS->nrows;
A_crs_d->ncols = CRS->ncols;
A_crs_d->nnz = CRS->nnz;

std::cout << "Device allocation done." << std::endl;

return  allocStatus;
}

During my first pass through Stack Overflow I found this already-answered question posted by someone else: Cusparse status mapping error while using cuda constant memory

However, since I do not use constant memory for any of the arguments passed to csrmv(), that answer did not solve my problem. I also checked data integrity, and the CRS_Matrix on the device matches the original in host memory exactly.
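
The check was essentially a copy-back-and-compare, roughly along these lines (an illustrative sketch only, not the exact verification code I ran):

/* Illustrative sketch of the copy-back-and-compare check described above
   (not the exact code used); it relies on the host-side CRS pointer and the
   A_crs_d produced by cudaAlloc(), and needs <cstring> for memcmp. */
FP *vals_h = new FP[CRS->nnz];
cudaMemcpy(vals_h, A_crs_d->vals, CRS->nnz * sizeof(FP), cudaMemcpyDeviceToHost);
bool vals_match = (memcmp(vals_h, CRS->vals, CRS->nnz * sizeof(FP)) == 0);
std::cout << "vals match: " << (vals_match ? "yes" : "no") << std::endl;
delete [] vals_h;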

I am at a loss with this problem and have not been able to find anything in the CUDA Toolkit documentation that points to the issue, so any help would be greatly appreciated.

Thanks in advance.

1 Answer:

Answer 0 (score: 0)

There are a few errors in the code you have shown.

  1. It is not possible to pass a pointer parameter to a routine, perform a cudaMalloc operation on that pointer inside the routine, and then expect the result to show up in the calling environment. You are doing this with the b_d, x_d, and A_crs_d parameters passed to cudaAlloc (the A_crs_d case uses cudaMallocHost). One possible fix is to handle those parameters as double-pointer (**) parameters in the routine and pass the address of each pointer to the routine. This allows the modified pointer values to show up in the calling environment. This is really a matter of correct C coding and is not specific to CUDA (a minimal standalone sketch of this pattern appears after the notes at the end of this answer).

  2. At least as far as cudaAlloc is concerned, you appear to intend to implement Ax=b. In that case, the length of the x vector is the number of columns of A, and the length of the b vector is the number of rows of A. In your cudaAlloc routine you allocate both of these using the row size of A, so that cannot be correct. This also affects the sizes used in the subsequent cudaMemcpy operations.

  3. The code you have shown appears to have been tested only for the double case, since the column index argument you pass differs between the two calls (presumably the float and double paths). In any event, I built a complete code (for the double case) around what you have shown, plus the changes above, and it runs without error and produces the correct results for me:

$ cat t1216.cu
#include <cusparse.h>
#include <iostream>

#define checkCuda(x) x

#ifdef USE_FLOAT
typedef float FP;
#else
#define DOUBLE
typedef double FP;
#endif

struct CRS_Matrix{
  FP *vals;
  int *col_ind;
  int *row_ptr;
  int ncols;
  int nnz;
  int nrows;
} *CRS;

cusparseHandle_t cuspHandle;

int HPC_sparsemv( CRS_Matrix *A_crs_d,
      FP * x_d, FP * y_d)
{
  FP alpha = 1.0f;
  FP beta = 0.0f;

  FP* vals = A_crs_d->vals;
  int* inds = A_crs_d->col_ind;
  int* row_ptr = A_crs_d->row_ptr;

  /*generate Matrix descriptor for SparseMV computation*/
  cusparseMatDescr_t matDescr;
  cusparseCreateMatDescr(&matDescr);

  cusparseStatus_t status;

  /*hand off control to CUSPARSE routine*/

#ifdef DOUBLE

  status = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
      A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
      inds, x_d, &beta, y_d);

#else

  status = cusparseScsrmv(cuspHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A_crs_d->nrows,
      A_crs_d->ncols, A_crs_d->nnz, &alpha, matDescr, vals, row_ptr,
      col_ind, x_d, &beta, y_d);   // col_ind here should probably be inds

#endif
  return (int)status;
}

int cudaAlloc(FP* r_d, FP* p_d, FP* Ap_d, FP** b_d, const FP* const b, FP** x_d, FP* const x,
    struct CRS_Matrix** A_crs_d, int nrows, int ncols, int nnz){

  std::cout << "Beginning device allocation..." << std::endl;

  int size_r = nrows * sizeof(FP);
  int size_c = ncols * sizeof(FP);
  int size_nnz = nnz * sizeof(FP);

  int allocStatus = 0;

  /*device alloc r_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &r_d, size_r) );

  /*device alloc p_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &p_d, size_c) );

  /*device alloc Ap_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &Ap_d, size_r) );

  /*device alloc b_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) b_d, size_r ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*b_d, b, size_r, cudaMemcpyHostToDevice));

  /*device alloc x_d*/
  allocStatus |= (int) checkCuda( cudaMalloc((void **) x_d, size_c ) );
  allocStatus |= (int) checkCuda( cudaMemcpy(*x_d, x, size_c, cudaMemcpyHostToDevice));

  /*device alloc A_crs_d*/
  FP * valtmp;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &valtmp, size_nnz) );
  allocStatus |= (int) checkCuda( cudaMemcpy(valtmp, CRS->vals, size_nnz, cudaMemcpyHostToDevice) );

  int * indtmp;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &indtmp, nnz * sizeof(int)) );
  allocStatus |= (int) checkCuda( cudaMemcpy(indtmp, CRS->col_ind,
      nnz * sizeof(int), cudaMemcpyHostToDevice) );

  int * rowtmp;
  allocStatus |= (int) checkCuda( cudaMalloc((void **) &rowtmp, (nrows + 1) * sizeof(int)) );
  allocStatus |= (int) checkCuda( cudaMemcpy(rowtmp, CRS->row_ptr,
      (nrows + 1) * sizeof(int), cudaMemcpyHostToDevice) );

  allocStatus |= (int) checkCuda( cudaMallocHost( A_crs_d, sizeof(CRS_Matrix)) );

  (*A_crs_d)->vals = valtmp;
  (*A_crs_d)->col_ind = indtmp;
  (*A_crs_d)->row_ptr = rowtmp;

  (*A_crs_d)->nrows = CRS->nrows;
  (*A_crs_d)->ncols = CRS->ncols;
  (*A_crs_d)->nnz = CRS->nnz;

  std::cout << "Device allocation done." << std::endl;

  return allocStatus;
}

int main(){

  CRS = (struct CRS_Matrix *)malloc(sizeof(struct CRS_Matrix));
  cusparseCreate(&cuspHandle);
  // simple test matrix
#define M0_M 5
#define M0_N 5
  FP m0_csr_vals[] = {2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f, 1.0f, 1.0f, 2.0f};
  int m0_col_idxs[] = { 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
  int m0_row_ptrs[] = { 0, 2, 5, 8, 11, 13};
  FP m0_d[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
  int m0_nnz = 13;
  FP *r_d, *p_d, *Ap_d, *b_d, *x_d;
  FP *b = new FP[M0_N];
  CRS_Matrix *A_crs_d;
  CRS->vals = m0_csr_vals;
  CRS->col_ind = m0_col_idxs;
  CRS->row_ptr = m0_row_ptrs;
  CRS->nrows = M0_M;
  CRS->ncols = M0_N;
  CRS->nnz = m0_nnz;
  // Ax = b
  // r_d, p_d, Ap_d ??
  int stat = cudaAlloc(r_d, p_d, Ap_d, &b_d, b, &x_d, m0_d, &A_crs_d, M0_M, M0_N, m0_nnz);
  std::cout << "cudaAlloc status: " << stat << std::endl;
  stat = HPC_sparsemv( A_crs_d, x_d, b_d);
  std::cout << "HPC_sparsemv status: " << stat << std::endl;
  FP *results = new FP[M0_M];
  cudaMemcpy(results, b_d, M0_M*sizeof(FP), cudaMemcpyDeviceToHost);
  std::cout << "Results:" << std::endl;
  for (int i = 0; i < M0_M; i++) std::cout << results[i] << std::endl;
  return 0;
}
$ nvcc -o t1216 t1216.cu -lcusparse
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
t1216.cu(153): warning: variable "r_d" is used before its value is set
t1216.cu(153): warning: variable "p_d" is used before its value is set
t1216.cu(153): warning: variable "Ap_d" is used before its value is set
$ cuda-memcheck ./t1216
========= CUDA-MEMCHECK
Beginning device allocation...
Device allocation done.
cudaAlloc status: 0
HPC_sparsemv status: 0
Results:
3
4
4
4
3
========= ERROR SUMMARY: 0 errors
$

    Notes:

    1. It is not clear what your intent is for r_d, p_d, and Ap_d in the cudaAlloc routine. I have left them as they were, but if you intend to use them for something, they may be subject to the issue I describe in point 1 above.

    2. As mentioned above, the parameters you are passing to the cusparse routines in HPC_sparsemv appear to be inconsistent between the float and double cases. In particular, the column index arguments do not match; the double version looked sensible to me, so that is what I used. If you use float, you may need to modify that argument.

    3. In the future, I suggest you provide a complete code, just as I have shown here, to demonstrate the failure. It is not much more code than what you have already shown, and it makes it much easier for others to help you.
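
As a final aside, here is a minimal standalone sketch of the pointer-passing fix from point 1 and the vector sizing from point 2 (hypothetical helper, not taken from the code above):

/* Minimal sketch, assuming a double-precision build: device pointers are passed
   by address (double pointer) so the cudaMalloc results are visible to the caller,
   and x/b are sized by the number of columns/rows of A respectively. */
#include <cuda_runtime.h>

typedef double FP;

int allocVectors(FP **x_d, FP **b_d, int nrows, int ncols)
{
    /* for b = A*x: x has one entry per column of A, b has one entry per row of A */
    cudaError_t err = cudaMalloc((void **)x_d, ncols * sizeof(FP));
    if (err != cudaSuccess) return (int)err;
    err = cudaMalloc((void **)b_d, nrows * sizeof(FP));
    return (int)err;
}

/* caller:
     FP *x_d, *b_d;
     allocVectors(&x_d, &b_d, nrows, ncols);  // x_d and b_d now hold valid device pointers
*/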