Question

我目前正在使用CUDA，并尝试使用cuBLAS和cuSPARSE库来求解Ax = b。我查看了NVIDIA提供的示例代码，包括conjugateGradient和conjugateGradientPrecond。然而，共轭梯度法仅适用于正定矩阵，并且它是一种迭代方法。现在，我有一些一般的稀疏矩阵，我想我应该利用cuSPARSE库。有谁知道如何使用cuSPARSE和cuBLAS库来求解Ax = b？我找不到对我有用的API。通常，矩阵规模预计至少为1000x1000，在某些情况下会达到100000x100000。我应该使用直接法吗？

Answer 1

在CUDA中解决一般稀疏线性系统的一种可能性是使用cuSOLVER。

cuSOLVER有三个有用的例程：

cusolverSpDcsrlsvlu，适用于方阵线性系统（未知数个数等于方程个数），内部使用带部分选主元的稀疏LU分解（sparse LU factorization with partial pivoting）；
cusolverSpDcsrlsvqr，适用于方阵线性系统（未知数个数等于方程个数），内部使用稀疏QR分解（sparse QR factorization）；
cusolverSpDcsrlsqvqr，适用于矩形线性系统（未知数个数与方程个数不同），内部求解最小二乘问题（least square problem）。

对于以上所有例程，支持的矩阵类型为CUSPARSE_MATRIX_TYPE_GENERAL。如果A是对称/Hermitian矩阵，并且只使用了（或只有）下三角/上三角部分有意义，则必须补全其缺失的上三角/下三角部分。

关注cusolverSpDcsrlsvlu

应注意两个输入参数：tol和reorder。关于前者，如果系统矩阵A是奇异的，则LU分解中矩阵U的某些对角元素为零；当|U(j,j)|<tol时，算法将该元素判定为零。关于后者，cuSOLVER提供了重排序功能以减少零填充（fill-in），这会显著影响LU分解的性能。reorder用于在重排序（reorder=1）和不重排序（reorder=0）之间切换。

还应注意输出参数singularity：如果A可逆，其值为-1；否则它给出满足U(j,j)=0的第一个索引j。

关于cusolverSpDcsrlsvqr

应注意与前面相同的输入/输出参数。特别是，tol用于判定奇异性，reorder无效；如果A可逆，singularity为-1，否则返回满足R(j,j)=0的第一个索引j。

关于cusolverSpDcsrlsqvqr

应注意输入参数tol，该参数用于确定A的秩。

还应注意以下输出参数：rankA，表示A的数值秩；p，一个长度等于A的列数的置换向量（请参阅文档以获取更多详细信息）；以及min_norm，即残差的范数||Ax - b||。

目前，截至CUDA 10.0，上述三个函数仅提供主机端（Host）版本，这意味着它们尚未在GPU上运行。必须以如下名称调用它们：

cusolverSpDcsrlsvluHost;
cusolverSpDcsrlsvqrHost;
cusolverSpDcsrlsqvqrHost;
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusparse.h>
#include <cusolverSp.h>

/*
 * Worked example: solve a small general sparse system A x = b with the
 * host-side cuSOLVER sparse routines.
 *
 * Flow:
 *   1. build a 4x4 dense matrix on the host (column-major) and copy it to the device;
 *   2. convert it to CSR on the device with cuSPARSE;
 *   3. copy the CSR arrays back to the host;
 *   4. solve with cusolverSpDcsrlsvluHost (LU); QR / least-squares alternatives
 *      are left as commented-out calls;
 *   5. print the solution and release every resource.
 */

/*******************/
/* iDivUp FUNCTION */
/*******************/
/* Integer ceiling division: smallest integer >= a / b for positive a, b. */
__host__ __device__ int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
/*
 * Prints a diagnostic for a failed CUDA runtime call and, when `abort` is true,
 * terminates the process with the error code as exit status.
 */
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

/*
 * Checks the status of a CUDA runtime call.
 * Because this is a function rather than a macro, the file/line reported are
 * those of this wrapper, not of the call site.
 */
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert(ans, __FILE__, __LINE__); }

/**************************/
/* CUSOLVE ERROR CHECKING */
/**************************/
/* Maps a cuSOLVER status code to its symbolic name ("<unknown>" if not listed). */
static const char *_cusolverGetErrorEnum(cusolverStatus_t error)
{
    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:                   return "CUSOLVER_STATUS_SUCCESS";
        case CUSOLVER_STATUS_NOT_INITIALIZED:           return "CUSOLVER_STATUS_NOT_INITIALIZED";
        case CUSOLVER_STATUS_ALLOC_FAILED:              return "CUSOLVER_STATUS_ALLOC_FAILED";
        case CUSOLVER_STATUS_INVALID_VALUE:             return "CUSOLVER_STATUS_INVALID_VALUE";
        case CUSOLVER_STATUS_ARCH_MISMATCH:             return "CUSOLVER_STATUS_ARCH_MISMATCH";
        case CUSOLVER_STATUS_EXECUTION_FAILED:          return "CUSOLVER_STATUS_EXECUTION_FAILED";
        case CUSOLVER_STATUS_INTERNAL_ERROR:            return "CUSOLVER_STATUS_INTERNAL_ERROR";
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        default:                                        break;
    }
    return "<unknown>";
}

/* Reports a failed cuSOLVER call (using the supplied file/line) and terminates. */
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS != err)
    {
        fprintf(stderr, "CUSOLVE error in file '%s', line %d, error: %s \nterminating!\n",
                file, line, _cusolverGetErrorEnum(err));
        assert(0);
        exit(EXIT_FAILURE); // assert() is compiled out under NDEBUG; still terminate
    }
}

/* Checks the status of a cuSOLVER call; reports this wrapper's file/line. */
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
/* Maps a cuSPARSE status code to its symbolic name ("<unknown>" if not listed). */
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {
        case CUSPARSE_STATUS_SUCCESS:                   return "CUSPARSE_STATUS_SUCCESS";
        case CUSPARSE_STATUS_NOT_INITIALIZED:           return "CUSPARSE_STATUS_NOT_INITIALIZED";
        case CUSPARSE_STATUS_ALLOC_FAILED:              return "CUSPARSE_STATUS_ALLOC_FAILED";
        case CUSPARSE_STATUS_INVALID_VALUE:             return "CUSPARSE_STATUS_INVALID_VALUE";
        case CUSPARSE_STATUS_ARCH_MISMATCH:             return "CUSPARSE_STATUS_ARCH_MISMATCH";
        case CUSPARSE_STATUS_MAPPING_ERROR:             return "CUSPARSE_STATUS_MAPPING_ERROR";
        case CUSPARSE_STATUS_EXECUTION_FAILED:          return "CUSPARSE_STATUS_EXECUTION_FAILED";
        case CUSPARSE_STATUS_INTERNAL_ERROR:            return "CUSPARSE_STATUS_INTERNAL_ERROR";
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSPARSE_STATUS_ZERO_PIVOT:                return "CUSPARSE_STATUS_ZERO_PIVOT";
        default:                                        break;
    }
    return "<unknown>";
}

/* Reports a failed cuSPARSE call (using the supplied file/line), resets the device and terminates. */
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err)
    {
        // One conversion per argument: file, line, numeric status, symbolic status.
        fprintf(stderr, "CUSPARSE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
                file, line, (int)err, _cusparseGetErrorEnum(err));
        cudaDeviceReset();
        assert(0);
        exit(EXIT_FAILURE); // assert() is compiled out under NDEBUG; still terminate
    }
}

/* Checks the status of a cuSPARSE call; reports this wrapper's file/line. */
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }

/* malloc() that terminates the program instead of returning NULL. */
static void *checkedMalloc(size_t bytes)
{
    void *ptr = malloc(bytes);
    if (ptr == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;
    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;        // --- Number of rows
    const int Ncols = 4;        // --- Number of columns
    const int N     = Nrows;    // --- Order of the (square) system
    const int lda   = Nrows;    // --- Leading dimension of the dense matrix

    // --- Host side dense matrix, column-major: element (r, c) lives at index r + c * lda.
    //     A = | 1 4 0 0 |
    //         | 0 2 3 0 |
    //         | 5 0 0 7 |
    //         | 0 0 9 0 |
    static const double A_colmajor[16] = {
        1.0, 0.0, 5.0, 0.0,     // column 0
        4.0, 2.0, 0.0, 0.0,     // column 1
        0.0, 3.0, 0.0, 9.0,     // column 2
        0.0, 0.0, 7.0, 0.0      // column 3
    };
    double *h_A_dense = (double *)checkedMalloc(Nrows * Ncols * sizeof(*h_A_dense));
    for (int i = 0; i < Nrows * Ncols; ++i) h_A_dense[i] = A_colmajor[i];

    // --- Device side dense matrix
    double *d_A_dense;
    gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO));

    int nnz = 0;                // --- Number of nonzero elements in dense matrix

    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;
    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));

    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)checkedMalloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side sparse matrix (CSR values, row offsets, column indices)
    double *d_A;
    gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;
    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;
    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    // NOTE(review): cusparseDdense2csr is a legacy conversion routine; newer toolkits are
    // believed to replace it with cusparseDenseToSparse_* — confirm against the toolkit in use.
    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side sparse matrix (the *Host solvers below take host pointers)
    double *h_A            = (double *)checkedMalloc(nnz * sizeof(*h_A));
    int    *h_A_RowIndices = (int *)checkedMalloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int    *h_A_ColIndices = (int *)checkedMalloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]);
    printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]);
    printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    // --- Right-hand side and result vectors. Only host copies are needed:
    //     the solver called below is a host routine and never touches device memory.
    double *h_y = (double *)checkedMalloc(Nrows * sizeof(double));
    h_y[0] = 100.0; h_y[1] = 200.0; h_y[2] = 400.0; h_y[3] = 500.0;

    double *h_x = (double *)checkedMalloc(Ncols * sizeof(double));

    // --- CUDA solver initialization
    cusolverSpHandle_t solver_handle;
    cusolveSafeCall(cusolverSpCreate(&solver_handle));

    const double tol     = 0.000001;    // --- Threshold below which a pivot is treated as zero
    const int    reorder = 0;           // --- 0 = no reordering
    int singularity = -1;               // --- Solver output, checked below (negative = no zero pivot found)

    // --- Using LU factorization
    cusolveSafeCall(cusolverSpDcsrlsvluHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, tol, reorder, h_x, &singularity));

    // --- Using QR factorization
    //cusolveSafeCall(cusolverSpDcsrlsvqrHost(solver_handle, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, tol, reorder, h_x, &singularity));

    // --- Using least squares (QR)
    //int rankA;
    //int *p = (int *)malloc(N * sizeof(int));
    //double min_norm;
    //cusolveSafeCall(cusolverSpDcsrlsqvqrHost(solver_handle, N, N, nnz, descrA, h_A, h_A_RowIndices, h_A_ColIndices, h_y, tol, &rankA, h_x, p, &min_norm));

    int exit_code = EXIT_SUCCESS;
    if (singularity >= 0)
    {
        // A non-negative value means the factorization hit a (near-)zero pivot at that index.
        fprintf(stderr, "Matrix is singular under tol = %g (first zero pivot at index %d)\n", tol, singularity);
        exit_code = EXIT_FAILURE;
    }
    else
    {
        printf("Showing the results...\n");
        for (int i = 0; i < N; i++) printf("%f\n", h_x[i]);
    }

    // --- Release library handles, device memory and host memory
    cusolveSafeCall(cusolverSpDestroy(solver_handle));
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    gpuErrchk(cudaFree(d_A_ColIndices));
    gpuErrchk(cudaFree(d_A_RowIndices));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_nnzPerVector));
    gpuErrchk(cudaFree(d_A_dense));

    free(h_x);
    free(h_y);
    free(h_A_ColIndices);
    free(h_A_RowIndices);
    free(h_A);
    free(h_nnzPerVector);
    free(h_A_dense);

    return exit_code;
}

并且输入参数应该都驻留在主机上。

下面，请找到一个使用以上三种可能性的完整工作示例：

Window.postMessage

解决CUDA中的一般稀疏线性系统问题

1 个答案: