我使用cuSPARSE
库对Nvidia K40上的稀疏矩阵 - 矩阵乘法进行基准测试。
我正在以CSR
格式创建自己的稀疏矩阵,并且我正在使用cusparseXcsrgemmNnz
库的cuSPARSE
例程。
但是,当我增加数据大小时,调用cusparseXcsrgemmNnz
时会发生错误,即不返回CUSPARSE_STATUS_SUCCESS
。
此外,cudaMemcpy
失败,并且未返回CUDA_SUCCESS
。
该代码适用于8 x 8
和16 x 16
矩阵。但是,从32 x 32
开始,会发现此错误。
修改:我从CUSPARSE_STATUS_EXECUTION_FAILED
收到cusparseXcsrgemmNnz
第三个矩阵大小。对于前两个矩阵大小,执行正确。
#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
double timerval()
{
struct timeval st;
gettimeofday(&st, NULL);
return (st.tv_sec+st.tv_usec*1e-6);
}
// perform sparse-matrix multiplication C=AxB
int main(){
double avg_time = 0, s_time, e_time;
cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
m *=2;
n = m;
k = m;
//density of the sparse matrix to be created. Assume 5% density.
double dense_const = 0.05;
int temp5, temp6,temp3,temp4;
int density=(m*n)*(dense_const);
nnzA = density;
nnzB = density;
h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
h_csrColIndA = (int *)malloc(density*sizeof(int));
h_csrColIndB = (int *)malloc(density*sizeof(int));
h_csrValA = (float *)malloc(density*sizeof(float));
h_csrValB = (float *)malloc(density*sizeof(float));
if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
{printf("malloc fail\n"); return -1;}
//position array for random initialisation of positions in input matrix
pos= (int *)calloc((m*n), sizeof(int));
int temp,temp1;
// printf("the density is %d\n",density);
// printf("check 1:\n");
//randomly initialise positions
for(i=0;i<density;i++)
{
temp1=rand()%(m*n);
pos[i]=temp1;
}
// printf("check 2:\n");
//sort the 'pos' array
for (i = 0 ; i < density; i++) {
int d = i;
int t;
while ( d > 0 && pos[d] < pos[d-1]) {
t = pos[d];
pos[d] = pos[d-1];
pos[d-1] = t;
d--;
}
}
// initialise with non zero elements and extract column and row ptr vector
j=1;
//ja[0]=1;
int p=0;
int f=0;
for(i = 0; i < density; i++)
{
temp=pos[i];
h_csrValA[f] = rand();
h_csrValB[f] = rand();
h_csrColIndA[f] = temp%m;
h_csrColIndB[f] = temp%m;
f++;
p++;
temp5= pos[i];
temp6=pos[i+1];
temp3=temp5-(temp5%m);
temp4=temp6-(temp6%m);
if(!(temp3== temp4))
{
if((temp3+m==temp6))
{}
else
{
h_csrRowPtrA[j]=p;
h_csrRowPtrB[j]=p;
j++;
}
}
}
// transfer data to device
cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
cudaMalloc(&csrColIndA, density*sizeof(int));
cudaMalloc(&csrColIndB, density*sizeof(int));
cudaMalloc(&csrValA, density*sizeof(float));
cudaMalloc(&csrValB, density*sizeof(float));
cudaCheckErrors("cudaMalloc fail");
cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
// set cusparse matrix types
CUSPARSE_CHECK(cusparseCreate(&hndl));
stat = cusparseCreateMatDescr(&descrA);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrB);
CUSPARSE_CHECK(stat);
stat = cusparseCreateMatDescr(&descrC);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
CUSPARSE_CHECK(stat);
cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
// figure out size of C
int baseC;
// nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnzC;
stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
CUSPARSE_CHECK(stat);
cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
cudaCheckErrors("cudaMalloc fail");
s_time=timerval();
stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
descrA, nnzA, csrRowPtrA, csrColIndA,
descrB, nnzB, csrRowPtrB, csrColIndB,
descrC, csrRowPtrC, nnzTotalDevHostPtr );
CUSPARSE_CHECK(stat);
if (NULL != nnzTotalDevHostPtr){
nnzC = *nnzTotalDevHostPtr;}
else{
cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
nnzC -= baseC;}
cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
cudaCheckErrors("cudaMalloc fail");
// perform multiplication C = A*B
for(loop=0;loop<1000;loop++)
{
stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
descrA, nnzA,
csrValA, csrRowPtrA, csrColIndA,
descrB, nnzB,
csrValB, csrRowPtrB, csrColIndB,
descrC,
csrValC, csrRowPtrC, csrColIndC);
CUSPARSE_CHECK(stat);
}
e_time=timerval();
avg_time=avg_time/1000;
// copy result (C) back to host
h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
h_csrValC = (float *)malloc(nnzC *sizeof(float));
if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
{printf("malloc fail\n"); return -1;}
cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrColIndC, csrColIndC, nnzC*sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density);
cudaFree(csrRowPtrC);
cudaFree(csrColIndC);
cudaFree(csrValC);
cudaFree(csrRowPtrA);
cudaFree(csrColIndA);
cudaFree(csrValA);
cudaFree(csrRowPtrB);
cudaFree(csrColIndB);
cudaFree(csrValB);
free(h_csrRowPtrC);
free(h_csrColIndC);
free(h_csrValC);
free(h_csrRowPtrA);
free(h_csrColIndA);
free(h_csrValA);
free(h_csrRowPtrB);
free(h_csrColIndB);
free(h_csrValB);
}
return 0;
}
答案 0 :(得分:3)
您似乎从here
中提取了部分代码如该帖子所示:
cusparseXcsrgemmNnz
中的失败可能表明CSR矩阵格式化存在潜在问题。
我很确定这是问题所在。生成格式正确的CSR matrix的过程已中断。
为了证明这一点,请在发布的代码中的指定评论之前添加以下代码:
printf("A RowPtrs: \n");
for (int i = 0; i < m+1; i++) printf("%d ", h_csrRowPtrA[i]);
printf("\nA ColInds: \n");
for (int i = 0; i < nnzA; i++) printf("%d ", h_csrColIndA[i]);
printf("\nB RowPtrs: \n");
for (int i = 0; i < n+1; i++) printf("%d ", h_csrRowPtrB[i]);
printf("\nB ColInds: \n");
for (int i = 0; i < nnzB; i++) printf("%d ", h_csrColIndB[i]);
printf("\n");
// add the above code before this comment:
// transfer data to device
当我这样做,重新编译并运行时,我得到的结果如下:
$ ./t730
A RowPtrs:
0 1 2 3 0 0 0 0 0
A ColInds:
6 7 1
B RowPtrs:
0 1 2 3 0 0 0 0 0
B ColInds:
6 7 1
Input size: 8 x 8 ,Time: 0.000000 and density is 3
A RowPtrs:
0 1 2 3 4 5 6 8 9 12 959542853 1886614883 1702064737 1299346243 1918980205 1232301409 1766154068
A ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
B RowPtrs:
-1688500168 1 2 3 4 5 6 8 9 12 0 0 0 0 0 0 0
B ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
cusparse fail: 6, line: 193
我们看到第一组CSR格式的A
和B
矩阵(包括以Input size: 8 x 8...
开头的行似乎完成但没有错误,但格式事实上已经破碎了。空行的行指针不是从零开始(不允许行指针向后移动),它们的行指针从最后一个填充行开始(这样每行的非零元素数等于当前行指针)减去前一行指针),和行指针序列中的最后一个值指向矩阵中最后一个元素之外的一个元素(即CSR行指针数组中的最后一个值是nnz,非零元素的数量)。
下一组A和B矩阵(对应于16x16通道)明显被打破。至少,A和B CSR格式的矩阵中的行指针明显超出范围。
您创建CSR矩阵的代码刚刚破解。我建议你研究CSR矩阵并创建一个工具来验证你将以这种方式随机创建的任何矩阵。 CUSP具有矩阵验证功能,我相信您可以使用其他CSR矩阵格式验证功能。
答案 1 :(得分:1)
按照Robert Crovella的回答,我想提供一个实现矩阵 - 矩阵稀疏乘法的全功能代码。
为了避免稀疏矩阵格式的任何歧义,代码从密集矩阵开始,并使用cusparse<t>dense2csr
将矩阵格式从密集转换为csr。
代码中涉及的两个矩阵是A
和B
。矩阵B
是一个置换矩阵。代码计算C = B * A
。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
int main()
{
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
/**************************/
/* SETTING UP THE PROBLEM */
/**************************/
const int N = 4; // --- Number of rows and columns
// --- Host side dense matrices
double *h_A_dense = (double*)malloc(N * N * sizeof(*h_A_dense));
double *h_B_dense = (double*)malloc(N * N * sizeof(*h_B_dense));
double *h_C_dense = (double*)malloc(N * N * sizeof(*h_C_dense));
// --- Column-major ordering
h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;
// --- Column-major ordering
h_B_dense[0] = 0.; h_B_dense[4] = 0.; h_B_dense[8] = 1.; h_B_dense[12] = 0.;
h_B_dense[1] = 1.; h_B_dense[5] = 0.; h_B_dense[9] = 0.; h_B_dense[13] = 0.;
h_B_dense[2] = 0.; h_B_dense[6] = 1.; h_B_dense[10] = 0.; h_B_dense[14] = 0.;
h_B_dense[3] = 0.; h_B_dense[7] = 0.; h_B_dense[11] = 0.; h_B_dense[15] = 1.;
// --- Create device arrays and copy host arrays to them
double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(*d_A_dense)));
double *d_B_dense; gpuErrchk(cudaMalloc(&d_B_dense, N * N * sizeof(*d_B_dense)));
double *d_C_dense; gpuErrchk(cudaMalloc(&d_C_dense, N * N * sizeof(*d_C_dense)));
gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(*d_A_dense), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_B_dense, h_B_dense, N * N * sizeof(*d_B_dense), cudaMemcpyHostToDevice));
// --- Descriptor for sparse matrix A
cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));
// --- Descriptor for sparse matrix B
cusparseMatDescr_t descrB; cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseSetMatType (descrB, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ONE));
// --- Descriptor for sparse matrix C
cusparseMatDescr_t descrC; cusparseSafeCall(cusparseCreateMatDescr(&descrC));
cusparseSafeCall(cusparseSetMatType (descrC, CUSPARSE_MATRIX_TYPE_GENERAL));
cusparseSafeCall(cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ONE));
int nnzA = 0; // --- Number of nonzero elements in dense matrix A
int nnzB = 0; // --- Number of nonzero elements in dense matrix B
const int lda = N; // --- Leading dimension of dense matrix
// --- Device side number of nonzero elements per row of matrix A
int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));
// --- Device side number of nonzero elements per row of matrix B
int *d_nnzPerVectorB; gpuErrchk(cudaMalloc(&d_nnzPerVectorB, N * sizeof(*d_nnzPerVectorB)));
cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, &nnzB));
// --- Host side number of nonzero elements per row of matrix A
int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));
// --- Host side number of nonzero elements per row of matrix B
int *h_nnzPerVectorB = (int *)malloc(N * sizeof(*h_nnzPerVectorB));
gpuErrchk(cudaMemcpy(h_nnzPerVectorB, d_nnzPerVectorB, N * sizeof(*h_nnzPerVectorB), cudaMemcpyDeviceToHost));
printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
printf("\n");
printf("Number of nonzero elements in dense matrix B = %i\n\n", nnzB);
for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorB[i]);
printf("\n");
// --- Device side sparse matrix
double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
double *d_B; gpuErrchk(cudaMalloc(&d_B, nnzB * sizeof(*d_B)));
int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
int *d_B_RowIndices; gpuErrchk(cudaMalloc(&d_B_RowIndices, (N + 1) * sizeof(*d_B_RowIndices)));
int *d_C_RowIndices; gpuErrchk(cudaMalloc(&d_C_RowIndices, (N + 1) * sizeof(*d_C_RowIndices)));
int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
int *d_B_ColIndices; gpuErrchk(cudaMalloc(&d_B_ColIndices, nnzB * sizeof(*d_B_ColIndices)));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));
cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrB, d_B_dense, lda, d_nnzPerVectorB, d_B, d_B_RowIndices, d_B_ColIndices));
// --- Host side sparse matrices
double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
double *h_B = (double *)malloc(nnzB * sizeof(*h_B));
int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
int *h_B_RowIndices = (int *)malloc((N + 1) * sizeof(*h_B_RowIndices));
int *h_B_ColIndices = (int *)malloc(nnzB * sizeof(*h_B_ColIndices));
int *h_C_RowIndices = (int *)malloc((N + 1) * sizeof(*h_C_RowIndices));
gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B, d_B, nnzB * sizeof(*h_B), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B_RowIndices, d_B_RowIndices, (N + 1) * sizeof(*h_B_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_B_ColIndices, d_B_ColIndices, nnzB * sizeof(*h_B_ColIndices), cudaMemcpyDeviceToHost));
printf("\nOriginal matrix A in CSR format\n\n");
for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");
printf("\nOriginal matrix B in CSR format\n\n");
for (int i = 0; i < nnzB; ++i) printf("B[%i] = %f ", i, h_B[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_B_RowIndices[%i] = %i \n", i, h_B_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
printf("\n");
for (int i = 0; i < nnzB; ++i) printf("h_B_ColIndices[%i] = %i \n", i, h_B_ColIndices[i]);
// --- Performing the matrix - matrix multiplication
int baseC, nnzC = 0;
// nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnzC;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
cusparseSafeCall(cusparseXcsrgemmNnz(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A_RowIndices, d_A_ColIndices, descrC, d_C_RowIndices,
nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) nnzC = *nnzTotalDevHostPtr;
else {
gpuErrchk(cudaMemcpy(&nnzC, d_C_RowIndices + N, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_C_RowIndices, sizeof(int), cudaMemcpyDeviceToHost));
nnzC -= baseC;
}
int *d_C_ColIndices; gpuErrchk(cudaMalloc(&d_C_ColIndices, nnzC * sizeof(int)));
double *d_C; gpuErrchk(cudaMalloc(&d_C, nnzC * sizeof(double)));
double *h_C = (double *)malloc(nnzC * sizeof(*h_C));
int *h_C_ColIndices = (int *)malloc(nnzC * sizeof(*h_C_ColIndices));
cusparseSafeCall(cusparseDcsrgemm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, N, descrB, nnzB,
d_B, d_B_RowIndices, d_B_ColIndices, descrA, nnzA, d_A, d_A_RowIndices, d_A_ColIndices, descrC,
d_C, d_C_RowIndices, d_C_ColIndices));
cusparseSafeCall(cusparseDcsr2dense(handle, N, N, descrC, d_C, d_C_RowIndices, d_C_ColIndices, d_C_dense, N));
gpuErrchk(cudaMemcpy(h_C , d_C, nnzC * sizeof(*h_C), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_C_RowIndices, d_C_RowIndices, (N + 1) * sizeof(*h_C_RowIndices), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_C_ColIndices, d_C_ColIndices, nnzC * sizeof(*h_C_ColIndices), cudaMemcpyDeviceToHost));
printf("\nResult matrix C in CSR format\n\n");
for (int i = 0; i < nnzC; ++i) printf("C[%i] = %f ", i, h_C[i]); printf("\n");
printf("\n");
for (int i = 0; i < (N + 1); ++i) printf("h_C_RowIndices[%i] = %i \n", i, h_C_RowIndices[i]); printf("\n");
printf("\n");
for (int i = 0; i < nnzC; ++i) printf("h_C_ColIndices[%i] = %i \n", i, h_C_ColIndices[i]);
gpuErrchk(cudaMemcpy(h_C_dense, d_C_dense, N * N * sizeof(double), cudaMemcpyDeviceToHost));
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++)
printf("%f \t", h_C_dense[i * N + j]);
printf("\n");
}
}