Question

以下程序使用cuSPARSE测试密集到稀疏的转换。它在前几行输出中产生垃圾。但是，如果我将标有(2)的行移动到标有(1)的行后面的位置，则程序运行正常。有人能告诉我可能是什么原因吗？

修改：为了使演示更清晰，我用thrust重写了程序，同样的问题仍然存在。

修改：正如Robert所建议的那样，我将其更改回没有thrust的版本并添加了api级错误检查代码。

#include <iostream>
#include <cusparse_v2.h>

using std::cerr;
using std::cout;
using std::endl;

// Wraps a statement sequence in do { ... } while (0) so that the expansion
// acts as one statement and is terminated by the ';' written at the call site.
// The body is passed as a single macro argument, so it must not contain commas
// outside of parentheses or string literals.
#define WRAP(x) do {x} while (0)
// Evaluates the cuSPARSE call `x` once and terminates the program with exit(1)
// if it does not return CUSPARSE_STATUS_SUCCESS. The numeric status code, the
// source location and the stringified call are written to cerr first.
// The message previously contained a leftover "TODO" placeholder glued to the
// status code; it now reads "Cusparse Error #<code> at Line <line> of <file>".
#define CHKcusparse(x) WRAP(                                        \
  cusparseStatus_t err = (x);                                       \
  if (err != CUSPARSE_STATUS_SUCCESS) {                             \
    cerr << "Cusparse Error #" << int(err) << " at Line "           \
         << __LINE__ << " of " << __FILE__ << ": " << #x << endl;   \
    exit(1);                                                        \
  }                                                                 \
)
// Runs the CUDA runtime call `x` once and stops the program with exit(1) when
// the result is not cudaSuccess. Before exiting, the numeric error code, the
// cudaGetErrorString() text, the source location and the stringified call are
// written to cerr.
#define CHKcuda(x) do {                                              \
  const cudaError_t cuda_status_ = (x);                              \
  if (cuda_status_ != cudaSuccess) {                                 \
    cerr << "Cuda Error #" << int(cuda_status_) << ", \""            \
         << cudaGetErrorString(cuda_status_) << "\" at Line "        \
         << __LINE__ << " of " << __FILE__ << ": " << #x << endl;    \
    exit(1);                                                         \
  }                                                                  \
} while (0)
// Allocates two buffers of N elements of type T: a host buffer assigned to
// h<X> (via malloc) and a device buffer assigned to d<X> (via cudaMalloc).
// Both pointer variables must already be declared at the point of use.
// The malloc result is not checked; a failing cudaMalloc goes through CHKcuda.
#define ALLOC(X, T, N) do {                                                \
  h##X = static_cast<T*>(malloc((N) * sizeof(T)));                         \
  CHKcuda(cudaMalloc(reinterpret_cast<void**>(&d##X), (N) * sizeof(T)));   \
} while(0)

int main() {
  srand(100);

  cusparseHandle_t g_cusparse_handle;
  CHKcusparse(cusparseCreate(&g_cusparse_handle));

  const int n = 100, in_degree = 10;
  int nnz = n * in_degree, nn = n * n;

  int *dnnz, *dridx, *dcols;
  int *hnnz, *hridx, *hcols;
  float *dvals, *dmat;
  float *hvals, *hmat;

  // (1) The number of non-zeros in each column.
  ALLOC(nnz, int, n);

  // The dense matrix.
  ALLOC(mat, float, nn);

  // The values in sparse matrix.
  ALLOC(vals, float, nnz);

  // (2) The row indices of the sparse matrix.
  ALLOC(ridx, int, nnz);

  // The column offsets of the sparse matrix.
  ALLOC(cols, int, n+1);

  // Fill and copy dense matrix and number of non-zeros.
  for (int i = 0; i < nn; i++) {hmat[i] = rand();}
  for (int i = 0; i < n; i++) {hnnz[i] = in_degree;}
  CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice));
  CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice));
  CHKcuda(cudaDeviceSynchronize());

  // Perform dense to CSC format
  cusparseMatDescr_t cspMatDesc;
  CHKcusparse(cusparseCreateMatDescr(&cspMatDesc));
  CHKcusparse(cusparseSdense2csc(
      g_cusparse_handle, n, n, cspMatDesc, dmat, n,
      dnnz, dvals, dridx, dcols
  ));

  // Copy row indices back.
  CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost));
  CHKcuda(cudaDeviceSynchronize());
  CHKcusparse(cusparseDestroyMatDescr(cspMatDesc));

  // Display row indices.
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < in_degree; j++) {
      std::cout << hridx[i * in_degree + j] << ", ";
    }
    std::cout << std::endl;
  }

  CHKcuda(cudaFree(dnnz));
  CHKcuda(cudaFree(dvals));
  CHKcuda(cudaFree(dridx));
  CHKcuda(cudaFree(dcols));
  CHKcuda(cudaFree(dmat));
  free(hnnz);
  free(hmat);
  free(hvals);
  free(hridx);
  free(hcols);
  return 0;
}

Answer 1

根本问题在于，您传递给密集到稀疏转换例程的数据自相矛盾：您传入的密集矩阵每列有100个非零元素，但您却告诉cuSPARSE每列只有10个非零元素。

如果您使用cuda-memcheck运行代码，您会发现cusparse中存在错误。

对于此代码，您可以通过将in_degree变量更改为100来解决问题。

对于一般情况，cuSPARSE提供了一个便捷的例程（cusparse<t>nnz()），可以正确计算并填充每列的非零元素个数。

Answer 2

正如Robert Crovella已经强调的那样，可以使用cuSPARSE的cusparse<t>nnz()和cusparse<t>dense2csr()例程有效地完成从密集到稀疏的转换；反方向的转换则可以通过cusparse<t>csr2dense()例程来完成。下面是一个完整的例子，展示了如何使用cuSPARSE在CSR格式下进行密集到稀疏以及稀疏到密集的转换。

**cuSparseUtilities.cuh**

#ifndef CUSPARSEUTILITIES_CUH
#define CUSPARSEUTILITIES_CUH

#include "cusparse_v2.h"

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t);
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
    const cusparseHandle_t handle, const int Nrows, const int Ncols);

#endif

**cuSparseUtilities.cu**

#include "cuSparseUtilities.cuh"
#include "Utilities.cuh"

/*****************************/
/* SETUP DESCRIPTOR FUNCTION */
/*****************************/
// Creates a cuSPARSE matrix descriptor in descrA and configures it with the
// requested matrix type and index base.
//
//   descrA     - output; receives the newly created descriptor.
//   matrixType - value passed to cusparseSetMatType.
//   indexBase  - value passed to cusparseSetMatIndexBase.
//
// Every call is wrapped in cusparseSafeCall, which is defined outside this
// file (presumably it reports/aborts on a non-success status - confirm in
// Utilities.cuh). The descriptor is not destroyed here.
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) {
    // The descriptor must exist before its attributes can be set.
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase));
}

/********************************************************/
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */
/********************************************************/
// Converts the dense double-precision device matrix d_A_dense (Nrows x Ncols,
// leading dimension Nrows) into CSR form using cuSPARSE.
//
// Outputs (all device allocations made here with cudaMalloc; nothing is freed
// by this function):
//   *d_nnzPerVector - Nrows ints, non-zero count of each row.
//   *d_A            - nnz doubles, the non-zero values.
//   *d_A_RowIndices - Nrows + 1 ints, the CSR row array.
//   *d_A_ColIndices - nnz ints, the CSR column array.
//   nnz             - total number of non-zero elements.
//
// descrA and handle are passed through unchanged to the cuSPARSE calls.
// gpuErrchk / cusparseSafeCall are defined outside this file.
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A,
                   int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA,
                   const cusparseHandle_t handle, const int Nrows, const int Ncols) {

    // --- One counter per row of the dense matrix
    gpuErrchk(cudaMalloc(d_nnzPerVector, sizeof(int) * Nrows));

    // --- Count the non-zeros of every row and obtain their total in nnz
    // --- (the leading dimension of the dense matrix is Nrows)
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, Nrows, *d_nnzPerVector, &nnz));

    // --- Device-side CSR storage, sized from the total computed above
    gpuErrchk(cudaMalloc(d_A, sizeof(double) * nnz));
    gpuErrchk(cudaMalloc(d_A_RowIndices, sizeof(int) * (Nrows + 1)));
    gpuErrchk(cudaMalloc(d_A_ColIndices, sizeof(int) * nnz));

    // --- Fill the CSR arrays from the dense matrix
    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, Nrows, *d_nnzPerVector, *d_A, *d_A_RowIndices, *d_A_ColIndices));

}

使用cuSPARSE进行密集到稀疏和稀疏到密集的转换

2 个答案: