我正在使用 CUDA 和 cuSPARSE 求解多个线性方程组,因此用到了 `cusparseScsrsm_analysis` 和 `cusparseScsrsm_solve`。程序没有报任何 CUDA 错误,但输出结果不正确,我找不出原因。
这是矩阵(3x3): 1 0 2; 0 0 3; 4 5 6
右侧是: 4 -6 7
我期待结果是
8 -2.6 -2
但我得到的是
4 0 0
这是我使用的代码: -
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <stdio.h>
// Report a failed CUDA runtime call on stderr and leave main() with a
// non-zero exit status. Only usable inside a function returning int.
// Early exit skips cleanup; acceptable here because the process terminates.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaStatus_ = (call);                                   \
        if (cudaStatus_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,        \
                    cudaGetErrorString(cudaStatus_));                       \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Same as CUDA_CHECK, but for calls returning cusparseStatus_t. The numeric
// status is printed because the legacy API has no status-to-string helper.
#define CUSPARSE_CHECK(call)                                                \
    do {                                                                    \
        cusparseStatus_t sparseStatus_ = (call);                            \
        if (sparseStatus_ != CUSPARSE_STATUS_SUCCESS) {                     \
            fprintf(stderr, "cuSPARSE error at line %d: status %d\n",       \
                    __LINE__, (int)sparseStatus_);                          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Solves op(A) * Y = alpha * X for one right-hand side with the legacy
// cuSPARSE csrsm analysis/solve pair, prints the elapsed GPU time and the
// three solution components.
// Returns 0 on success, 1 if any CUDA or cuSPARSE call fails.
int main()
{
    // Problem definition: A = [1 0 2; 0 0 3; 4 5 6] in zero-based CSR form.
    // NOTE(review): csrsm is a *triangular* solver. This matrix is not
    // triangular and has no stored entry at (1,1), so the result will not be
    // the solution of the full system A*y = x -- confirm the intended input
    // or use a general sparse solver instead.
    const int size = 3;      // rows/columns of A
    const int nnz = 6;       // stored non-zeros of A
    const int numRhs = 1;    // columns of X and Y (a single right-hand side)
    const float alpha = 1.0f;
    float values[] = {1, 2, 3, 4, 5, 6};
    int colIdx[] = {0, 2, 2, 0, 1, 2};
    int rowPtr[] = {0, 2, 3, 6};
    float x[] = {4, -6, 7};
    float y[3] = {0, 0, 0};

    float *dev_values = 0;
    int *dev_rowPtr = 0;
    int *dev_colIdx = 0;
    float *dev_x = 0;
    float *dev_y = 0;

    // Choose which GPU to run on, change this on a multi-GPU system.
    // Done before any library handle is created so everything shares the
    // same device.
    CUDA_CHECK(cudaSetDevice(0));

    cusparseHandle_t handle;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    cusparseSolveAnalysisInfo_t info = 0;
    CUSPARSE_CHECK(cusparseCreateSolveAnalysisInfo(&info));
    cusparseMatDescr_t descr = 0;
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
    CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));

    cudaEvent_t start, stop;
    float time;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));

    // Allocate device buffers for the right-hand side, the solution and the
    // three CSR arrays.
    CUDA_CHECK(cudaMalloc((void**)&dev_x, size * numRhs * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_y, size * numRhs * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_values, nnz * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_rowPtr, (size + 1) * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&dev_colIdx, nnz * sizeof(int)));

    // Upload the inputs (and a zeroed output buffer).
    CUDA_CHECK(cudaMemcpyAsync(dev_x, x, size * numRhs * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(dev_values, values, nnz * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(dev_rowPtr, rowPtr, (size + 1) * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(dev_colIdx, colIdx, nnz * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyAsync(dev_y, y, size * numRhs * sizeof(float), cudaMemcpyHostToDevice));

    CUSPARSE_CHECK(cusparseScsrsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, size, nnz, descr,
                                           dev_values, dev_rowPtr, dev_colIdx, info));
    // The fourth argument is the number of right-hand-side columns. The
    // buffers hold exactly one column, so it must be numRhs (1), not size;
    // passing size would make the solver touch memory past dev_x / dev_y.
    CUSPARSE_CHECK(cusparseScsrsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, size, numRhs,
                                        &alpha, descr, dev_values, dev_rowPtr, dev_colIdx, info,
                                        dev_x, size, dev_y, size));

    CUDA_CHECK(cudaMemcpyAsync(y, dev_y, size * numRhs * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaEventRecord(stop, 0));
    // Waiting on 'stop' also guarantees the copy into y has completed before
    // the host reads it below.
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));

    printf ("Time for the kernel: %f ms\n", time);
    printf("%f\n", y[0]);
    printf("%f\n", y[1]);
    printf("%f\n", y[2]);

    // Release every resource while the device context is still alive.
    CUDA_CHECK(cudaFree(dev_x));
    CUDA_CHECK(cudaFree(dev_y));
    CUDA_CHECK(cudaFree(dev_values));
    CUDA_CHECK(cudaFree(dev_rowPtr));
    CUDA_CHECK(cudaFree(dev_colIdx));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUSPARSE_CHECK(cusparseDestroySolveAnalysisInfo(info));
    CUSPARSE_CHECK(cusparseDestroyMatDescr(descr));
    CUSPARSE_CHECK(cusparseDestroy(handle));

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It has to come last: it destroys the context that the frees and
    // destroys above operate on.
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
答案 0 :(得分:1)
问题在于 cuSPARSE 的这组函数只能求解稀疏三角线性方程组,而你的输入矩阵并不是三角矩阵。我用三角矩阵测试了你的代码,输出是正确的。