I am trying to run a simple test of the csrmv_mp function. I have a working program, but I get an incorrect result for one particular matrix-vector pair. If I run exactly the same program with csrmv instead, I get the correct result.
Here is my code (a simplified version of the example in Appendix C of the cuSPARSE documentation: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
 * How to compile (assuming CUDA is installed at /usr/local/cuda/):
 *   nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
 *   g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double *A, int lda, const char *name)
{
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            double Areg = A[row + col * lda];
            printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
        }
    }
}
int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cusparseHandle_t cusparseH = NULL;
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL;
    cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;
    const int n = 3;
    const int nnzA = 6;
    /*
     *     | 0 1 2 |
     * A = | 1 0 3 |
     *     | 2 3 0 |
     *
     * Initial vector
     *
     *     | 1/3 |
     * v = | 1/3 |
     *     | 1/3 |
     */
    const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
    const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x0[n] = { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 }; /* initial guess */
    double x[n];        /* numerical eigenvector */
    int *d_csrRowPtrA = NULL;
    int *d_csrColIndA = NULL;
    double *d_csrValA = NULL;
    double *d_x = NULL; /* eigenvector */
    double *d_y = NULL; /* workspace */
    const double tol = 1.e-6;
    const int max_ites = 30;
    const double h_one = 1.0;
    const double h_zero = 0.0;
    /* step 1: create cublas/cusparse handles, bind a stream */
    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);
    cublasStat = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cublasStat = cublasSetStream(cublasH, stream);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cusparseStat = cusparseCreate(&cusparseH);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseStat = cusparseSetStream(cusparseH, stream);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    /* step 2: configuration of matrix A */
    cusparseStat = cusparseCreateMatDescr(&descrA);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    /* step 3: copy A to device */
    cudaStat1 = cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (n + 1));
    cudaStat2 = cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA);
    cudaStat3 = cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA);
    cudaStat4 = cudaMalloc((void **)&d_x, sizeof(double) * n);
    cudaStat5 = cudaMalloc((void **)&d_y, sizeof(double) * n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);
    cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
                           cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
                           cudaMemcpyHostToDevice);
    cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
                           cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    /* step 4: copy the initial guess x0 to the device */
    cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    /* step 5: y = A*x */
    cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnzA, &h_one, descrA, d_csrValA,
                                     d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
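    /*
     * For comparison: the csrmv path, which returns the correct
     * {1, 4/3, 5/3}, takes an identical argument list; swapping the
     * single call below for the one above is the only change needed:
     *
     * cusparseStat = cusparseDcsrmv(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
     *                               n, n, nnzA, &h_one, descrA, d_csrValA,
     *                               d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
     */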
    /* step 6: report result */
    cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);
    printf("vector = \n");
    printMatrix(n, 1, x, n, "V0");
    printf("=====\n");
    /* free resources */
    if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
    if (d_csrColIndA) cudaFree(d_csrColIndA);
    if (d_csrValA) cudaFree(d_csrValA);
    if (d_x) cudaFree(d_x);
    if (d_y) cudaFree(d_y);
    if (cublasH) cublasDestroy(cublasH);
    if (cusparseH) cusparseDestroy(cusparseH);
    if (stream) cudaStreamDestroy(stream);
    if (descrA) cusparseDestroyMatDescr(descrA);
    cudaDeviceReset();
    return 0;
}
The resulting vector is {1, 1, 1}, but computing by hand, or running the very same program with the csrmv function instead, gives the vector {1, 4/3, 5/3}. I really don't understand why this happens. The only thing I can think of is that I wrote the matrix down incorrectly in its CSR format. Also, I did not use CUSPARSE_MATRIX_TYPE_SYMMETRIC, because the function does not accept that matrix type (the documentation is wrong about this).
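To rule that out, here is a minimal standalone host-side CSR multiply over the same three arrays (a sanity-check sketch, not part of the program above); it prints the expected {1, 4/3, 5/3}, so the CSR encoding itself appears to be correct:

#include <stdio.h>

/* Host-side check: y = A*x using the same CSR arrays as in the program above. */
int main(void)
{
    const int n = 3;
    const int csrRowPtrA[4] = { 0, 2, 4, 6 };
    const int csrColIndA[6] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[6] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x[3] = { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 };
    double y[3];

    for (int row = 0; row < n; row++) {
        double sum = 0.0;
        /* accumulate over the nonzeros of this row */
        for (int j = csrRowPtrA[row]; j < csrRowPtrA[row + 1]; j++)
            sum += csrValA[j] * x[csrColIndA[j]];
        y[row] = sum;
    }
    /* prints y = { 1.000000, 1.333333, 1.666667 }, i.e. {1, 4/3, 5/3} */
    printf("y = { %f, %f, %f }\n", y[0], y[1], y[2]);
    return 0;
}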
I would really appreciate it if anyone could help me.
Edit: I am using CUDA 9.0, my operating system is Windows 10 Home, and my GPU is a GTX 960M.
Answer (score: 1)
I just updated to CUDA 9.1 and, as Robert Crovella said, this bug has been resolved.
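For anyone hitting the same symptom, a quick way to confirm which CUDA runtime and cuSPARSE build a binary actually picks up at run time is the standard version queries (a minimal sketch using cudaRuntimeGetVersion and cusparseGetVersion):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cusparse.h>

int main(void)
{
    int runtimeVersion = 0, cusparseVersion = 0;
    cusparseHandle_t handle = NULL;

    cudaRuntimeGetVersion(&runtimeVersion);       /* e.g. 9010 for CUDA 9.1 */
    cusparseCreate(&handle);
    cusparseGetVersion(handle, &cusparseVersion); /* version of the linked cuSPARSE */
    printf("CUDA runtime %d, cuSPARSE %d\n", runtimeVersion, cusparseVersion);
    cusparseDestroy(handle);
    return 0;
}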