I am trying to run a simple test of the csrmv_mp function. I have a working program, but I get an incorrect result for one particular matrix-vector pair. If I run exactly the same program with csrmv instead, I get the correct result.
Here is my code (a simplified version of the example in Appendix C of the cuSPARSE documentation: http://docs.nvidia.com/cuda/cusparse/index.html#csrmv_examples):
/*
 * How to compile (assuming CUDA is installed at /usr/local/cuda/):
 *   nvcc -c -I/usr/local/cuda/include csrmvmp_example.cpp
 *   g++ -fopenmp -o csrmvmp_example csrmvmp_example.o -L/usr/local/cuda/lib64 -lcublas -lcusparse -lcudart
 */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusparse.h>
void printMatrix(int m, int n, const double *A, int lda, const char *name)
{
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            double Areg = A[row + col * lda];
            printf("%s(%d,%d) = %f\n", name, row + 1, col + 1, Areg);
        }
    }
}
int main(int argc, char *argv[])
{
    cublasHandle_t cublasH = NULL;
    cusparseHandle_t cusparseH = NULL;
    cudaStream_t stream = NULL;
    cusparseMatDescr_t descrA = NULL;
    cublasStatus_t cublasStat = CUBLAS_STATUS_SUCCESS;
    cusparseStatus_t cusparseStat = CUSPARSE_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;
    cudaError_t cudaStat4 = cudaSuccess;
    cudaError_t cudaStat5 = cudaSuccess;
    const int n = 3;
    const int nnzA = 6;
    /*
     *     | 0 1 2 |
     * A = | 1 0 3 |
     *     | 2 3 0 |
     *
     * Initial vector
     *
     *     | 1/3 |
     * v = | 1/3 |
     *     | 1/3 |
     */
    const int csrRowPtrA[n + 1] = { 0, 2, 4, 6 };
    const int csrColIndA[nnzA] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[nnzA] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x0[n] = { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 }; /* initial guess */
    double x[n];        /* numerical eigenvector */
    int *d_csrRowPtrA = NULL;
    int *d_csrColIndA = NULL;
    double *d_csrValA = NULL;
    double *d_x = NULL; /* eigenvector */
    double *d_y = NULL; /* workspace */
    const double tol = 1.e-6;
    const int max_ites = 30;
    const double h_one = 1.0;
    const double h_zero = 0.0;
    /* step 1: create cublas/cusparse handles, bind a stream */
    cudaStat1 = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    assert(cudaSuccess == cudaStat1);
    cublasStat = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cublasStat = cublasSetStream(cublasH, stream);
    assert(CUBLAS_STATUS_SUCCESS == cublasStat);
    cusparseStat = cusparseCreate(&cusparseH);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseStat = cusparseSetStream(cusparseH, stream);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    /* step 2: configuration of matrix A */
    cusparseStat = cusparseCreateMatDescr(&descrA);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
    cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    /* step 3: copy A to device */
    cudaStat1 = cudaMalloc((void **)&d_csrRowPtrA, sizeof(int) * (n + 1));
    cudaStat2 = cudaMalloc((void **)&d_csrColIndA, sizeof(int) * nnzA);
    cudaStat3 = cudaMalloc((void **)&d_csrValA, sizeof(double) * nnzA);
    cudaStat4 = cudaMalloc((void **)&d_x, sizeof(double) * n);
    cudaStat5 = cudaMalloc((void **)&d_y, sizeof(double) * n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    assert(cudaSuccess == cudaStat4);
    assert(cudaSuccess == cudaStat5);
    cudaStat1 = cudaMemcpy(d_csrRowPtrA, csrRowPtrA, sizeof(int) * (n + 1),
                           cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_csrColIndA, csrColIndA, sizeof(int) * nnzA,
                           cudaMemcpyHostToDevice);
    cudaStat3 = cudaMemcpy(d_csrValA, csrValA, sizeof(double) * nnzA,
                           cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);
    /* step 4: copy the initial guess x0 to the device */
    cudaStat1 = cudaMemcpy(d_x, x0, sizeof(double) * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    /* step 5: y = A*x */
    cusparseStat = cusparseDcsrmv_mp(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnzA, &h_one, descrA, d_csrValA,
                                     d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
    assert(CUSPARSE_STATUS_SUCCESS == cusparseStat);
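    /*
     * For comparison: the csrmv path, which returns the correct
     * {1, 4/3, 5/3}, takes an identical argument list; swapping the
     * single call below for the one above is the only change needed:
     *
     * cusparseStat = cusparseDcsrmv(cusparseH, CUSPARSE_OPERATION_NON_TRANSPOSE,
     *                               n, n, nnzA, &h_one, descrA, d_csrValA,
     *                               d_csrRowPtrA, d_csrColIndA, d_x, &h_zero, d_y);
     */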
    /* step 6: report result */
    cudaStat1 = cudaMemcpy(x, d_y, sizeof(double) * n, cudaMemcpyDeviceToHost);
    assert(cudaSuccess == cudaStat1);
    printf("vector = \n");
    printMatrix(n, 1, x, n, "V0");
    printf("=====\n");
    /* free resources */
    if (d_csrRowPtrA) cudaFree(d_csrRowPtrA);
    if (d_csrColIndA) cudaFree(d_csrColIndA);
    if (d_csrValA) cudaFree(d_csrValA);
    if (d_x) cudaFree(d_x);
    if (d_y) cudaFree(d_y);
    if (cublasH) cublasDestroy(cublasH);
    if (cusparseH) cusparseDestroy(cusparseH);
    if (stream) cudaStreamDestroy(stream);
    if (descrA) cusparseDestroyMatDescr(descrA);
    cudaDeviceReset();
    return 0;
}
The resulting vector is {1, 1, 1}, but computing by hand, or running the very same program with the csrmv function instead, gives the vector {1, 4/3, 5/3}. I really don't understand why this happens. The only thing I can think of is that I wrote the matrix down incorrectly in its CSR format. Also, I did not use CUSPARSE_MATRIX_TYPE_SYMMETRIC, because the function does not accept that matrix type (the documentation is wrong about this).
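To rule that out, here is a minimal standalone host-side CSR multiply over the same three arrays (a sanity-check sketch, not part of the program above); it prints the expected {1, 4/3, 5/3}, so the CSR encoding itself appears to be correct:

#include <stdio.h>

/* Host-side check: y = A*x using the same CSR arrays as in the program above. */
int main(void)
{
    const int n = 3;
    const int csrRowPtrA[4] = { 0, 2, 4, 6 };
    const int csrColIndA[6] = { 1, 2, 0, 2, 0, 1 };
    const double csrValA[6] = { 1.0, 2.0, 1.0, 3.0, 2.0, 3.0 };
    const double x[3] = { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 };
    double y[3];

    for (int row = 0; row < n; row++) {
        double sum = 0.0;
        /* accumulate over the nonzeros of this row */
        for (int j = csrRowPtrA[row]; j < csrRowPtrA[row + 1]; j++)
            sum += csrValA[j] * x[csrColIndA[j]];
        y[row] = sum;
    }
    /* prints y = { 1.000000, 1.333333, 1.666667 }, i.e. {1, 4/3, 5/3} */
    printf("y = { %f, %f, %f }\n", y[0], y[1], y[2]);
    return 0;
}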
I would really appreciate it if anyone could help me.
Edit: I am using CUDA 9.0, my operating system is Windows 10 Home, and my GPU is a GTX 960M.
Answer (score: 1)
I just updated to CUDA 9.1 and, as Robert Crovella said, this bug has been resolved.
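For anyone hitting the same symptom, a quick way to confirm which CUDA runtime and cuSPARSE build a binary actually picks up at run time is the standard version queries (a minimal sketch using cudaRuntimeGetVersion and cusparseGetVersion):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cusparse.h>

int main(void)
{
    int runtimeVersion = 0, cusparseVersion = 0;
    cusparseHandle_t handle = NULL;

    cudaRuntimeGetVersion(&runtimeVersion);       /* e.g. 9010 for CUDA 9.1 */
    cusparseCreate(&handle);
    cusparseGetVersion(handle, &cusparseVersion); /* version of the linked cuSPARSE */
    printf("CUDA runtime %d, cuSPARSE %d\n", runtimeVersion, cusparseVersion);
    cusparseDestroy(handle);
    return 0;
}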