Question

我正在使用 CUDA 和 cuSPARSE 求解多个线性方程组，因此用到了 'cusparseScsrsm_analysis' 和 'cusparseScsrsm_solve'。程序没有报告任何 CUDA 错误，但输出结果不正确，我找不出原因。

这是矩阵（3x3）： 1 0 2; 0 0 3; 4 5 6

右侧是： 4 -6 7

我期待结果是

8 -2.6 -2

但我得到的是

4 0 0

这是我使用的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <stdio.h>

// Error-check helpers used only inside main(): on failure they print the
// location and status, set main()'s `exitCode`, and jump to its `cleanup`
// label so every resource acquired so far is released exactly once.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cudaStatus_ = (call);                                    \
        if (cudaStatus_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cudaStatus_));                        \
            exitCode = 1;                                                    \
            goto cleanup;                                                    \
        }                                                                    \
    } while (0)

#define CUSPARSE_CHECK(call)                                                 \
    do {                                                                     \
        cusparseStatus_t sparseStatus_ = (call);                             \
        if (sparseStatus_ != CUSPARSE_STATUS_SUCCESS) {                      \
            fprintf(stderr, "cuSPARSE error at %s:%d: status %d\n",          \
                    __FILE__, __LINE__, (int)sparseStatus_);                 \
            exitCode = 1;                                                    \
            goto cleanup;                                                    \
        }                                                                    \
    } while (0)

/*
 * Runs the legacy cuSPARSE triangular solve (csrsm analysis + solve) on a
 * 3x3 CSR matrix with a single right-hand side and prints the result.
 *
 * Returns 0 on success and 1 if any CUDA or cuSPARSE call fails.
 *
 * NOTE(review): csrsm is a *triangular* solver. The sample matrix below
 * (1 0 2; 0 0 3; 4 5 6) is not triangular and has no stored (1,1) entry, so
 * the routine cannot produce the solution of the full system A*y = x. Feed a
 * triangular matrix, or use a general sparse solver, to get a meaningful
 * answer.
 */
int main()
{
    // All locals are declared up front so that `goto cleanup` never jumps
    // over an initialization.
    const int size = 3;        // the matrix is size x size
    const int nnz = 6;         // number of stored non-zeros
    const int numRhs = 1;      // x and y each hold exactly one column
    const float alpha = 1.0f;

    // CSR representation (zero-based indices).
    float values[] = {1, 2, 3, 4, 5, 6};
    int colIdx[] = {0, 2, 2, 0, 1, 2};
    int rowPtr[] = {0, 2, 3, 6};

    float x[] = {4, -6, 7};    // right-hand side
    float y[3] = {0, 0, 0};    // receives the solution

    float *dev_values = 0;
    int *dev_rowPtr = 0;
    int *dev_colIdx = 0;
    float *dev_x = 0;
    float *dev_y = 0;

    cusparseHandle_t handle = 0;
    cusparseSolveAnalysisInfo_t info = 0;
    cusparseMatDescr_t descr = 0;

    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float time = 0.0f;
    int exitCode = 0;

    // Select the device before any library handle binds to a context.
    // Change this on a multi-GPU system.
    CUDA_CHECK(cudaSetDevice(0));

    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUSPARSE_CHECK(cusparseCreateSolveAnalysisInfo(&info));
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
    CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
    // The triangular solve depends on which triangle is used and whether the
    // diagonal is implicit; set both explicitly instead of relying on
    // whatever the freshly created descriptor happens to contain.
    CUSPARSE_CHECK(cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER));
    CUSPARSE_CHECK(cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    // The timed region covers allocation, transfers and the solve.
    CUDA_CHECK(cudaEventRecord(start, 0));

    // Device buffers: right-hand side, solution, and the three CSR arrays.
    CUDA_CHECK(cudaMalloc((void**)&dev_x, size * numRhs * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_y, size * numRhs * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_values, nnz * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_rowPtr, (size + 1) * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_colIdx, nnz * sizeof(int)));

    // Host arrays are ordinary stack memory, so plain blocking copies are used.
    CUDA_CHECK(cudaMemcpy(dev_x, x, size * numRhs * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_rowPtr, rowPtr, (size + 1) * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_colIdx, colIdx, nnz * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_y, y, size * numRhs * sizeof(float), cudaMemcpyHostToDevice));

    CUSPARSE_CHECK(cusparseScsrsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        size, nnz, descr, dev_values, dev_rowPtr, dev_colIdx, info));

    // The 4th argument is the number of right-hand-side columns. It must
    // match the buffers (one column here); passing `size` would make the
    // solver touch size*size elements of the size-element x and y buffers.
    CUSPARSE_CHECK(cusparseScsrsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        size, numRhs, &alpha, descr, dev_values, dev_rowPtr, dev_colIdx, info,
        dev_x, size, dev_y, size));

    // Blocking copy: y is valid on the host once this returns.
    CUDA_CHECK(cudaMemcpy(y, dev_y, size * numRhs * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    printf ("Time for the kernel: %f ms\n", time);

    printf("%f\n",y[0]);
    printf("%f\n",y[1]);
    printf("%f\n",y[2]);

cleanup:
    // Release everything that was acquired; cudaFree(0) is a no-op.
    if (stop) cudaEventDestroy(stop);
    if (start) cudaEventDestroy(start);
    cudaFree(dev_x);
    cudaFree(dev_y);
    cudaFree(dev_values);
    cudaFree(dev_rowPtr);
    cudaFree(dev_colIdx);
    if (info) cusparseDestroySolveAnalysisInfo(info);
    if (descr) cusparseDestroyMatDescr(descr);
    if (handle) cusparseDestroy(handle);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It comes last because it tears down the context that the handles and
    // buffers above live in.
    cudaDeviceReset();
    return exitCode;
}

Answer 1

问题在于 cuSPARSE 的这组函数（csrsm）只能求解稀疏的三角线性方程组，而你的输入矩阵并不是三角矩阵。我用一个三角矩阵测试了你的代码，输出是正确的。

cusparse错误的输出

1 个答案: