使用cuSPARSE在CUDA中进行稀疏矩阵 - 矩阵乘法

时间:2015-04-17 01:21:21

标签: cuda nvidia sparse-matrix gpu-programming

我使用cuSPARSE库对Nvidia K40上的稀疏矩阵 - 矩阵乘法进行基准测试。 我正在以CSR格式创建自己的稀疏矩阵,并且我正在使用cusparseXcsrgemmNnz库的cuSPARSE例程。 但是,当我增加数据大小时,调用cusparseXcsrgemmNnz时会发生错误,即不返回CUSPARSE_STATUS_SUCCESS。 此外,cudaMemcpy失败,并且未返回CUDA_SUCCESS。 该代码适用于8 x 816 x 16矩阵。但是,从32 x 32开始,会发现此错误。

修改:我从CUSPARSE_STATUS_EXECUTION_FAILED收到cusparseXcsrgemmNnz第三个矩阵大小。对于前两个矩阵大小,执行正确。

#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(__err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } \
} while (0)


double timerval()
{
    struct timeval st;
    gettimeofday(&st, NULL);
    return (st.tv_sec+st.tv_usec*1e-6);
}

// perform sparse-matrix multiplication C=AxB
int main(){ 
double avg_time = 0, s_time, e_time;

cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
    m *=2;
    n = m;
    k = m;
    //density of the sparse matrix to be created. Assume 5% density.
    double dense_const = 0.05;
    int temp5, temp6,temp3,temp4;
    int density=(m*n)*(dense_const);
    nnzA = density;
    nnzB = density;
    h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
    h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
    h_csrColIndA = (int *)malloc(density*sizeof(int));
    h_csrColIndB = (int *)malloc(density*sizeof(int));
    h_csrValA  = (float *)malloc(density*sizeof(float));
    h_csrValB  = (float *)malloc(density*sizeof(float));
    if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
    {printf("malloc fail\n"); return -1;}

    //position array for random initialisation of positions in input matrix
    pos= (int *)calloc((m*n), sizeof(int));
    int temp,temp1;

    //  printf("the density is %d\n",density);
    //  printf("check 1:\n");

    //randomly initialise positions
    for(i=0;i<density;i++)
    {
        temp1=rand()%(m*n);
        pos[i]=temp1;

    }
    //  printf("check 2:\n");

    //sort the 'pos' array
    for (i = 0 ; i < density; i++) {
        int d = i;
        int t;

        while ( d > 0 && pos[d] < pos[d-1]) {
            t          = pos[d];
            pos[d]   = pos[d-1];
            pos[d-1] = t;        
            d--;
        }
    }
    // initialise with non zero elements and extract column and row ptr vector
    j=1;
    //ja[0]=1;

    int p=0;
    int f=0;

    for(i = 0; i < density; i++)
    {
        temp=pos[i];
         h_csrValA[f] = rand();
         h_csrValB[f] = rand();
         h_csrColIndA[f] = temp%m;
         h_csrColIndB[f] = temp%m;
        f++;
        p++;
        temp5= pos[i];
        temp6=pos[i+1];
        temp3=temp5-(temp5%m);
        temp4=temp6-(temp6%m);

        if(!(temp3== temp4))
        {   
            if((temp3+m==temp6))
            {}
            else    
            {   
                h_csrRowPtrA[j]=p;
                h_csrRowPtrB[j]=p;
                j++;
            }

        }       
    }

    // transfer data to device

    cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
    cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
    cudaMalloc(&csrColIndA, density*sizeof(int));
    cudaMalloc(&csrColIndB, density*sizeof(int));
    cudaMalloc(&csrValA, density*sizeof(float));
    cudaMalloc(&csrValB, density*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    // set cusparse matrix types
    CUSPARSE_CHECK(cusparseCreate(&hndl));
    stat = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrB);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrC);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;

    // figure out size of C
    int baseC;
    // nnzTotalDevHostPtr points to host memory
    int *nnzTotalDevHostPtr = &nnzC;
    stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
    CUSPARSE_CHECK(stat);
    cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
    cudaCheckErrors("cudaMalloc fail");

    s_time=timerval();

    stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
    descrA, nnzA, csrRowPtrA, csrColIndA,
    descrB, nnzB, csrRowPtrB, csrColIndB,
    descrC, csrRowPtrC, nnzTotalDevHostPtr );
    CUSPARSE_CHECK(stat);
    if (NULL != nnzTotalDevHostPtr){
    nnzC = *nnzTotalDevHostPtr;}
    else{
    cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    nnzC -= baseC;}
    cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
    cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
    cudaCheckErrors("cudaMalloc fail");
    // perform multiplication C = A*B

    for(loop=0;loop<1000;loop++)
    {
        stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
        descrA, nnzA,
        csrValA, csrRowPtrA, csrColIndA,
        descrB, nnzB,
        csrValB, csrRowPtrB, csrColIndB,
        descrC,
        csrValC, csrRowPtrC, csrColIndC);
        CUSPARSE_CHECK(stat);
    }

    e_time=timerval();

    avg_time=avg_time/1000;
    // copy result (C) back to host
    h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
    h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
    h_csrValC  = (float *)malloc(nnzC *sizeof(float));
    if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
    {printf("malloc fail\n"); return -1;}
    cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrColIndC, csrColIndC,  nnzC*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density); 

    cudaFree(csrRowPtrC);
    cudaFree(csrColIndC);
    cudaFree(csrValC);

    cudaFree(csrRowPtrA);
    cudaFree(csrColIndA);
    cudaFree(csrValA);

    cudaFree(csrRowPtrB);
    cudaFree(csrColIndB);
    cudaFree(csrValB);

    free(h_csrRowPtrC);
    free(h_csrColIndC);
    free(h_csrValC);

    free(h_csrRowPtrA);
    free(h_csrColIndA);
    free(h_csrValA);

    free(h_csrRowPtrB);
    free(h_csrColIndB);
    free(h_csrValB);
}
return 0;

}

2 个答案:

答案 0 :(得分:3)

您似乎从here

中提取了部分代码

如该帖子所示:

  

cusparseXcsrgemmNnz中的失败可能表明CSR矩阵格式化存在潜在问题。

我很确定这是问题所在。生成格式正确的CSR matrix的过程已中断。

为了证明这一点,请在发布的代码中的指定评论之前添加以下代码:

printf("A RowPtrs: \n");
for (int i = 0; i < m+1; i++) printf("%d ", h_csrRowPtrA[i]);
printf("\nA ColInds: \n");
for (int i = 0; i < nnzA; i++) printf("%d ", h_csrColIndA[i]);
printf("\nB RowPtrs: \n");
for (int i = 0; i < n+1; i++) printf("%d ", h_csrRowPtrB[i]);
printf("\nB ColInds: \n");
for (int i = 0; i < nnzB; i++) printf("%d ", h_csrColIndB[i]);
printf("\n");

    // add the above code before this comment:
    // transfer data to device

当我这样做,重新编译并运行时,我得到的结果如下:

$ ./t730
A RowPtrs:
0 1 2 3 0 0 0 0 0
A ColInds:
6 7 1
B RowPtrs:
0 1 2 3 0 0 0 0 0
B ColInds:
6 7 1

 Input size: 8 x 8 ,Time: 0.000000 and density is 3
A RowPtrs:
0 1 2 3 4 5 6 8 9 12 959542853 1886614883 1702064737 1299346243 1918980205 1232301409 1766154068
A ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
B RowPtrs:
-1688500168 1 2 3 4 5 6 8 9 12 0 0 0 0 0 0 0
B ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
cusparse fail: 6, line: 193

我们看到第一组CSR格式的AB矩阵(包括以Input size: 8 x 8...开头的行似乎完成但没有错误,但格式事实上已经破碎了。空行的行指针不是从零开始(不允许行指针向后移动),它们的行指针从最后一个填充行开始(这样每行的非零元素数等于当前行指针)减去前一行指针),行指针序列中的最后一个值指向矩阵中最后一个元素之外的一个元素(即CSR行指针数组中的最后一个值是nnz,非零元素的数量)。

下一组A和B矩阵(对应于16x16通道)明显被打破。至少,A和B CSR格式的矩阵中的行指针明显超出范围。

您创建CSR矩阵的代码刚刚破解。我建议你研究CSR矩阵并创建一个工具来验证你将以这种方式随机创建的任何矩阵。 CUSP具有矩阵验证功能,我相信您可以使用其他CSR矩阵格式验证功能。

答案 1 :(得分:1)

按照Robert Crovella的回答,我想提供一个实现矩阵 - 矩阵稀疏乘法的全功能代码。

为了避免稀疏矩阵格式的任何歧义,代码从密集矩阵开始,并使用cusparse<t>dense2csr将矩阵格式从密集转换为csr。

代码中涉及的两个矩阵是AB。矩阵B是一个置换矩阵。代码计算C = B * A

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#include "Utilities.cuh"

#include <cuda_runtime.h>
#include <cusparse_v2.h>

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int N     = 4;                // --- Number of rows and columns

    // --- Host side dense matrices
    double *h_A_dense = (double*)malloc(N * N * sizeof(*h_A_dense));
    double *h_B_dense = (double*)malloc(N * N * sizeof(*h_B_dense));
    double *h_C_dense = (double*)malloc(N * N * sizeof(*h_C_dense));

    // --- Column-major ordering
    h_A_dense[0] = 0.4612;  h_A_dense[4] = -0.0006;     h_A_dense[8]  = 0.3566;     h_A_dense[12] = 0.0; 
    h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640;      h_A_dense[9]  = 0.0723;     h_A_dense[13] = 0.0; 
    h_A_dense[2] = 0.3566;  h_A_dense[6] = 0.0723;      h_A_dense[10] = 0.7543;     h_A_dense[14] = 0.0; 
    h_A_dense[3] = 0.;      h_A_dense[7] = 0.0;         h_A_dense[11] = 0.0;        h_A_dense[15] = 0.1; 

    // --- Column-major ordering
    h_B_dense[0] = 0.;      h_B_dense[4] = 0.;          h_B_dense[8]  = 1.;         h_B_dense[12] = 0.; 
    h_B_dense[1] = 1.;      h_B_dense[5] = 0.;          h_B_dense[9]  = 0.;         h_B_dense[13] = 0.; 
    h_B_dense[2] = 0.;      h_B_dense[6] = 1.;          h_B_dense[10] = 0.;         h_B_dense[14] = 0.; 
    h_B_dense[3] = 0.;      h_B_dense[7] = 0.;          h_B_dense[11] = 0.;         h_B_dense[15] = 1.; 

    // --- Create device arrays and copy host arrays to them
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(*d_A_dense)));
    double *d_B_dense;  gpuErrchk(cudaMalloc(&d_B_dense, N * N * sizeof(*d_B_dense)));
    double *d_C_dense;  gpuErrchk(cudaMalloc(&d_C_dense, N * N * sizeof(*d_C_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B_dense, h_B_dense, N * N * sizeof(*d_B_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));  

    // --- Descriptor for sparse matrix B
    cusparseMatDescr_t descrB;      cusparseSafeCall(cusparseCreateMatDescr(&descrB));
    cusparseSafeCall(cusparseSetMatType     (descrB, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ONE));  

    // --- Descriptor for sparse matrix C
    cusparseMatDescr_t descrC;      cusparseSafeCall(cusparseCreateMatDescr(&descrC));
    cusparseSafeCall(cusparseSetMatType     (descrC, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ONE));  

    int nnzA = 0;                           // --- Number of nonzero elements in dense matrix A
    int nnzB = 0;                           // --- Number of nonzero elements in dense matrix B

    const int lda = N;                      // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row of matrix A
    int *d_nnzPerVectorA;   gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));

    // --- Device side number of nonzero elements per row of matrix B
    int *d_nnzPerVectorB;   gpuErrchk(cudaMalloc(&d_nnzPerVectorB, N * sizeof(*d_nnzPerVectorB)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, &nnzB));

    // --- Host side number of nonzero elements per row of matrix A
    int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));

    // --- Host side number of nonzero elements per row of matrix B
    int *h_nnzPerVectorB = (int *)malloc(N * sizeof(*h_nnzPerVectorB));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorB, d_nnzPerVectorB, N * sizeof(*h_nnzPerVectorB), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
    printf("\n");

    printf("Number of nonzero elements in dense matrix B = %i\n\n", nnzB);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorB[i]);
    printf("\n");

    // --- Device side sparse matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
    double *d_B;            gpuErrchk(cudaMalloc(&d_B, nnzB * sizeof(*d_B)));

    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
    int *d_B_RowIndices;    gpuErrchk(cudaMalloc(&d_B_RowIndices, (N + 1) * sizeof(*d_B_RowIndices)));
    int *d_C_RowIndices;    gpuErrchk(cudaMalloc(&d_C_RowIndices, (N + 1) * sizeof(*d_C_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
    int *d_B_ColIndices;    gpuErrchk(cudaMalloc(&d_B_ColIndices, nnzB * sizeof(*d_B_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, d_B, d_B_RowIndices, d_B_ColIndices));

    // --- Host side sparse matrices
    double *h_A = (double *)malloc(nnzA * sizeof(*h_A));        
    double *h_B = (double *)malloc(nnzB * sizeof(*h_B));        
    int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
    int *h_B_RowIndices = (int *)malloc((N + 1) * sizeof(*h_B_RowIndices));
    int *h_B_ColIndices = (int *)malloc(nnzB * sizeof(*h_B_ColIndices));
    int *h_C_RowIndices = (int *)malloc((N + 1) * sizeof(*h_C_RowIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B, d_B, nnzB * sizeof(*h_B), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B_RowIndices, d_B_RowIndices, (N + 1) * sizeof(*h_B_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_B_ColIndices, d_B_ColIndices, nnzB * sizeof(*h_B_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix A in CSR format\n\n");
    for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");

    printf("\nOriginal matrix B in CSR format\n\n");
    for (int i = 0; i < nnzB; ++i) printf("B[%i] = %f ", i, h_B[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_B_RowIndices[%i] = %i \n", i, h_B_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);  

    printf("\n");
    for (int i = 0; i < nnzB; ++i) printf("h_B_ColIndices[%i] = %i \n", i, h_B_ColIndices[i]);  

    // --- Performing the matrix - matrix multiplication
    int baseC, nnzC = 0;
    // nnzTotalDevHostPtr points to host memory
    int *nnzTotalDevHostPtr = &nnzC;    

    cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));

    cusparseSafeCall(cusparseXcsrgemmNnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB, 
                                         d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A_RowIndices, d_A_ColIndices, descrC, d_C_RowIndices, 
                                         nnzTotalDevHostPtr));
    if (NULL != nnzTotalDevHostPtr) nnzC = *nnzTotalDevHostPtr;
    else {
        gpuErrchk(cudaMemcpy(&nnzC,  d_C_RowIndices + N, sizeof(int), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(&baseC, d_C_RowIndices,     sizeof(int), cudaMemcpyDeviceToHost));
        nnzC -= baseC;
    }
    int *d_C_ColIndices;    gpuErrchk(cudaMalloc(&d_C_ColIndices, nnzC * sizeof(int)));
    double *d_C;            gpuErrchk(cudaMalloc(&d_C, nnzC * sizeof(double)));
    double *h_C = (double *)malloc(nnzC * sizeof(*h_C));        
    int *h_C_ColIndices = (int *)malloc(nnzC * sizeof(*h_C_ColIndices));
    cusparseSafeCall(cusparseDcsrgemm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
                                      d_B, d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A, d_A_RowIndices, d_A_ColIndices, descrC,
                                      d_C, d_C_RowIndices, d_C_ColIndices));

    cusparseSafeCall(cusparseDcsr2dense(handle, N, N, descrC, d_C, d_C_RowIndices, d_C_ColIndices, d_C_dense, N));

    gpuErrchk(cudaMemcpy(h_C ,           d_C,            nnzC * sizeof(*h_C), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_C_RowIndices, d_C_RowIndices, (N + 1) * sizeof(*h_C_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_C_ColIndices, d_C_ColIndices, nnzC * sizeof(*h_C_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nResult matrix C in CSR format\n\n");
    for (int i = 0; i < nnzC; ++i) printf("C[%i] = %f ", i, h_C[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_C_RowIndices[%i] = %i \n", i, h_C_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < nnzC; ++i) printf("h_C_ColIndices[%i] = %i \n", i, h_C_ColIndices[i]);  

    gpuErrchk(cudaMemcpy(h_C_dense, d_C_dense, N * N * sizeof(double), cudaMemcpyDeviceToHost));

    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) 
            printf("%f \t", h_C_dense[i * N + j]);
        printf("\n");
        }
}