BiCGStab using CUDA cuSparse routines from Fortran: CUSPARSE_INTERNAL_ERROR during the analysis phase

Date: 2018-05-01 21:49:42

Tags: cuda fortran

Edit

This question was closed as off-topic: "Questions seeking debugging help ('why isn't this code working?') must include the desired behavior, a specific problem or error message, and the shortest code necessary to reproduce it in the question itself..."

I have made two broken attempts at BiCGStab, linked in the GitHub repo below (the error comes from the *_analysis cuSparse functions). Since that repo is not small enough, I have stripped out everything that happens after the error and included the resulting code here. Because NVIDIA does not provide Fortran-to-C CUDA bindings written in Fortran, the example has to include the interface module.

Desired behavior: no CUSPARSE_INTERNAL_ERROR returned from the cuSparse analysis routines, so that I can carry on implementing BiCGStab in Fortran.

Specific error: CUSPARSE_INTERNAL_ERROR; when run under cuda-memcheck, this reduced example produces the second error posted below (32 instances of an out-of-bounds read of size 4 in convert_CsrToCoo).

!
! CUDA 
!
module cuda_cusolve_map_reduced

 interface

 ! cudaMemset
 integer (c_int) function cudaMemset( devPtr,value, count ) &
                              bind (C, name="cudaMemset" ) 
   use iso_c_binding
   implicit none
   type (c_ptr),value  :: devPtr
   integer(c_int), value :: value
   integer(c_size_t), value :: count
 end function cudaMemset
 ! cudaMalloc
 integer (c_int) function cudaMalloc ( buffer, size ) &
                              bind (C, name="cudaMalloc" ) 
   use iso_c_binding
   implicit none
   type (c_ptr)  :: buffer
   integer (c_size_t), value :: size
 end function cudaMalloc

 integer (c_int) function cudaMemcpy ( dst, src, count, kind ) &
                              bind (C, name="cudaMemcpy" )
   ! note: cudaMemcpyHostToDevice = 1
   ! note: cudaMemcpyDeviceToHost = 2
   ! note: cudaMemcpyDeviceToDevice = 3
   use iso_c_binding
   type (c_ptr), value :: dst, src
   integer (c_size_t), value :: count, kind
 end function cudaMemcpy

 ! cudaFree
 integer (c_int) function cudaFree(buffer)  bind(C, name="cudaFree")
   use iso_c_binding
   implicit none
   type (c_ptr), value :: buffer
 end function cudaFree

 integer (c_int) function cudaMemGetInfo(fre, tot) &
                              bind(C, name="cudaMemGetInfo")
   use iso_c_binding
   implicit none
   type(c_ptr),value :: fre
   type(c_ptr),value :: tot
 end function cudaMemGetInfo

 integer(c_int) function cusparseCreate(cusparseHandle) &
                             bind(C,name="cusparseCreate")

   use iso_c_binding
   implicit none
   type(c_ptr)::cusparseHandle
   end function cusparseCreate

 integer(c_int) function cudaStreamCreate(stream) &
                             bind(C,name="cudaStreamCreate")

 use iso_c_binding
 implicit none
 type(c_ptr)::stream
 end function cudaStreamCreate

 integer(c_int) function cusolverSpSetStream(handle,stream) &
                             bind(C,name="cusolverSpSetStream")

 use iso_c_binding
 implicit none
 type(c_ptr),value :: handle
 type(c_ptr),value :: stream
 end function cusolverSpSetStream

 integer(c_int) function cusparseSetStream(cusparseHandle,stream) &
                             bind(C,name="cusparseSetStream")

 use iso_c_binding
 implicit none
 type(c_ptr),value :: cusparseHandle
 type(c_ptr),value :: stream
 end function cusparseSetStream

 integer(c_int) function cusparseCreateMatDescr(descrA) &
                             bind(C,name="cusparseCreateMatDescr")

 use iso_c_binding
 implicit none
 type(c_ptr):: descrA
 end function cusparseCreateMatDescr


 integer(c_int) function cusparseSetMatType2(descrA,CUSPARSE_MATRIX_TYPE) &
                             bind(C,name="cusparseSetMatType")

 use iso_c_binding
 implicit none
 type(c_ptr), value:: descrA
 integer(c_int),value :: CUSPARSE_MATRIX_TYPE
 end function cusparseSetMatType2

 integer(c_int) function cusparseSetMatIndexBase2(descrA,CUSPARSE_INDEX_BASE) &
                             bind(C,name="cusparseSetMatIndexBase")

 use iso_c_binding
 implicit none
 type(c_ptr), value:: descrA
 integer(c_int),value :: CUSPARSE_INDEX_BASE
 end function cusparseSetMatIndexBase2

 integer(c_int) function cusparseSetMatFillMode(descrA,CUSPARSE_FILL_TYPE) &
                 bind(C,name="cusparseSetMatFillMode")

 use iso_c_binding
 implicit none
 type(c_ptr), value:: descrA
 integer(c_int),value :: CUSPARSE_FILL_TYPE
 end function cusparseSetMatFillMode

 integer(c_int) function cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE) &
                 bind(C,name="cusparseSetMatDiagType")

 use iso_c_binding
 implicit none
 type(c_ptr), value:: descrA
 integer(c_int),value :: CUSPARSE_DIAG_TYPE
 end function cusparseSetMatDiagType


 integer(c_int) function cudaDeviceSynchronize() bind(C,name="cudaDeviceSynchronize")

 use iso_c_binding
 implicit none
 end function cudaDeviceSynchronize


 integer(c_int) function cusparseDestroy(cusparseHandle) bind(C,name="cusparseDestroy")

 use iso_c_binding
 implicit none
 type(c_ptr),value::cusparseHandle
 end function cusparseDestroy

 integer(c_int) function cudaStreamDestroy(stream) bind(C,name="cudaStreamDestroy")

 use iso_c_binding
 implicit none
 type(c_ptr),value :: stream
 end function cudaStreamDestroy

 integer(c_int) function cusparseDestroyMatDescr(descrA) bind(C,name="cusparseDestroyMatDescr")

 use iso_c_binding
 implicit none
 type(c_ptr), value:: descrA
 end function cusparseDestroyMatDescr

 integer(c_int) function cusparseCreateSolveAnalysisInfo(info) &
               bind(C,name="cusparseCreateSolveAnalysisInfo")

 use iso_c_binding
 implicit none
 type(c_ptr) :: info
 end function cusparseCreateSolveAnalysisInfo

 integer(c_int) function cusparseDcsrsv_analysis(handle,transA, &
                 m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info) &
                 bind(C,name="cusparseDcsrsv_analysis")

 use iso_c_binding
 implicit none
 type(c_ptr), value :: handle
 integer(c_int), value :: transA
 integer(c_int), value :: m
 integer(c_int),value :: nnz
 type(c_ptr), value :: descrA
 type(c_ptr) :: csrValA
 type(c_ptr) :: csrRowPtrA
 type(c_ptr) :: csrColIndA
 type(c_ptr), value :: info
 end function cusparseDcsrsv_analysis

 integer(c_int) function cusparseDestroySolveAnalysisInfo(info) &
                 bind(C,name="cusparseDestroySolveAnalysisInfo")

 use iso_c_binding
 implicit none
 type(c_ptr),value::info
 end function cusparseDestroySolveAnalysisInfo

 end interface  

end module cuda_cusolve_map_reduced
!
!======================================================================
!======================================================================
  program main
   implicit none
   integer n,inz,i
   parameter (n=5)
   parameter (inz=13)
   double precision x(n),x_known(n),rhs(n),b(inz)
   integer ib(n+1),jb(inz)

   write(*,'(A)') 'Setting up test system'
   b(1) = 1.0d0;b(2) = 1.0d0;b(3) = 5.0d0;b(4) = 2.0d0
   b(5) = 1.0d0;b(6) = 3.0d0;b(7) = 2.0d0;b(8) = 1.0d0
   b(9) = 6.0d0;b(10) = 3.0d0;b(11) = 1.0d0;b(12) = 2.0d0
   b(13) = 1.0d0
   rhs(1) = 1.0d0;rhs(2) = 2.0d0;rhs(3) = 1.0d0
   rhs(4) = 3.0d0;rhs(5) = 0.0d0

   ib(1) = 1;ib(2) = 5;ib(3) = 7
   ib(4) = 9;ib(5) = 12;ib(6) = 14

   jb(1) = 1;jb(2) = 2;jb(3) = 4;jb(4) = 5
   jb(5) = 2;jb(6) = 3;jb(7) = 2;jb(8) = 3
   jb(9) = 1;jb(10) = 3;jb(11) = 4;jb(12) = 4
   jb(13) = 5

   x_known(1) = 0.08d0;x_known(2) = 0.2d0;x_known(3) = 0.6d0
   x_known(4) = 0.72d0;x_known(5) = -1.44d0
   x(1)=1.0d0;x(2)=1.0d0;x(3)=1.0d0
   x(4)=1.0d0;x(5)=1.0d0


  write(*,'(A)') 'Starting iterative solve'
  call cuda_BiCGStab_error(n,rhs,x,inz,ib,jb,b)
  write(*,'(A)') 'Found and Known solutions'
  do 23 i = 1,n
     write(*,*) x(i),x_known(i)
23  continue

  end program main
!
!=========================================================
subroutine cuda_BiCGStab_error(n,rhs,x,inz,ib,jb,b)
!=========================================================
use iso_c_binding
use cuda_cusolve_map_reduced
implicit none
integer n, inz
double precision x(n), rhs(n), b(inz)
target rhs,b,x
integer ib(n+1),jb(inz)
target ib,jb
integer ii,ierr,ierr2

integer, parameter :: dp = kind(1.d0)

type(c_ptr) :: cusparseHandle
type(c_ptr) :: stream
type(c_ptr) :: descrA
type(c_ptr) :: descrM
type(c_ptr) :: info_l
type(c_ptr) :: info_u
type(c_ptr) :: ArowsIndex
type(c_ptr) :: AcolsIndex
type(c_ptr) :: Aval
type(c_ptr) :: h_x  
type(c_ptr) :: h_rhs

! -------------------- pointers to device memory    
type(c_ptr) :: devPtrArowsIndex
type(c_ptr) :: devPtrAcolsIndex
type(c_ptr) :: devPtrAval
type(c_ptr) :: devPtrMrowsIndex
type(c_ptr) :: devPtrMcolsIndex
type(c_ptr) :: devPtrMval
type(c_ptr) :: devPtrX
type(c_ptr) :: devPtrF

integer*8 Arow1_i_size,Arow_d_size,Acol_d_size,Annz_i_size,Annz_d_size

integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_TRI
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (cudaMemcpyDeviceToDevice=3)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)
parameter (CUBLAS_OP_TRI=3)

ierr2 = 0

! define pointers to host memory
ArowsIndex = c_loc(ib)
AcolsIndex = c_loc(jb)
Aval = c_loc(b)
h_x  = c_loc(x)  ! x = A \ b
h_rhs  = c_loc(rhs)  ! b = ones(m,1)

Arow1_i_size=sizeof(ib(1:n+1))
Arow_d_size=sizeof(rhs(1:n))
Acol_d_size=sizeof(x(1:n))
Annz_i_size=sizeof(jb(1:inz))
Annz_d_size=sizeof(b(1:inz))

! Define the CUDA stream and matrix parameters
ierr = cusparseCreate(cusparseHandle)
ierr2 = ierr2 + ierr
ierr = cusparseCreateMatDescr(descrA)
ierr2 = ierr2 + ierr
ierr = cusparseCreateMatDescr(descrM)
ierr2 = ierr2 + ierr
ierr = cudaStreamCreate(stream) 
ierr2 = ierr2 + ierr
ierr = cusparseSetStream(cusparseHandle,stream) 
ierr2 = ierr2 + ierr
ierr = cusparseSetMatType2(descrA,CUBLAS_OP_N) 
ierr2 = ierr2 + ierr
ierr = cusparseSetMatIndexBase2(descrA,CUBLAS_OP_T) 
ierr2 = ierr2 + ierr
ierr = cusparseSetMatType2(descrM,CUBLAS_OP_N) 
ierr2 = ierr2 + ierr
ierr = cusparseSetMatIndexBase2(descrM,CUBLAS_OP_T) 
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
  write(*,'(A, I2)') 'Error during matrix setup ',ierr2
  stop
end if 
write(*,*) 'Allocating GPU memory'
ierr = cudaMalloc(devPtrX,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrF,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrAval,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrAcolsIndex,Annz_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrArowsIndex,Arow1_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrMval,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
  write(*,'(A, I2)') 'Error during CUDA allocation: ',ierr2
  stop
end if 
write(*,*) 'Cleaning GPU memory'
ierr = cudaMemset(devPtrX,0,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrF,0,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrAval,0,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrAcolsIndex,0,Annz_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrArowsIndex,0,Arow1_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrMval,0,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
  write(*,'(A, I3)') 'Error during CUDA memory cleaning : ',ierr2
  stop
end if 

! transfer memory over to GPU
write(*,*) 'Transferring memory to GPU'
ierr = cudaMemcpy(devPtrArowsIndex,ArowsIndex,Arow1_i_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrAcolsIndex,AcolsIndex,Annz_i_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrAval,Aval,Annz_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrMval,devPtrAval,Annz_d_size,cudaMemcpyDeviceToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrX,h_x,Arow_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrF,h_rhs,Arow_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
    write (*, '(A, I2)') " Error during cuda memcpy ", ierr2
    stop
end if

write(*,*) 'Creating analysis for LU'
ierr = cusparseCreateSolveAnalysisInfo(info_l)
ierr2 = ierr2 + ierr
ierr = cusparseCreateSolveAnalysisInfo(info_u)
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
    write (*, '(A, I2)') " Error during LU analysis creation ", ierr2
    stop
end if

write(*,*) 'Analyzing L of LU'
ierr = cusparseSetMatFillMode(descrM,CUBLAS_OP_N)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatDiagType(descrM,CUBLAS_OP_T)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
ierr = cusparseDcsrsv_analysis(cusparseHandle,CUBLAS_OP_N,n,inz,descrM,devPtrAval,&
                               devPtrArowsIndex,devPtrAcolsIndex,info_l)
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
    write (*, '(A, I2)') " Error during L of LU analyzing sub2 ", ierr2
    stop
end if
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
    write (*, '(A, I2)') " Error during L of LU analyzing ", ierr2
    stop
end if

ierr = cudaFree(devPtrArowsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrAcolsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrAval)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMrowsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMcolsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMval)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrX)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrF)
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
  write(*,'(A, I2)') 'Error during cudafree: ',ierr2
  stop
end if 

ierr = cusparseDestroy(cusparseHandle)
ierr2 = ierr2 + ierr
ierr = cudaStreamDestroy(stream)
ierr2 = ierr2 + ierr
ierr = cusparseDestroyMatDescr(descrA)
ierr2 = ierr2 + ierr
ierr = cusparseDestroyMatDescr(descrM)
ierr2 = ierr2 + ierr
ierr = cusparseDestroySolveAnalysisInfo(info_l)
ierr2 = ierr2 + ierr
ierr = cusparseDestroySolveAnalysisInfo(info_u)
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
  write(*,'(A, I2)') 'Error during cuda handle destruction: ',ierr2
  stop
end if 

return
end subroutine cuda_BiCGStab_error

End of edit

I am trying to add a CUDA implementation of the BiCGStab solver to a legacy Fortran 77 code, with the added complication of being restricted to a Fortran compiler only (the interfaces to the CUDA functions must be written in Fortran as opposed to C/C++). That last restriction has turned out to be an extra complicating factor and may well be the source of my problem, but my project manager will not budge on the requirement. I am comfortable with Fortran but genuinely new to CUDA, so I would not be at all surprised if I have missed a small detail or have a fundamental misunderstanding.

All of my testing has been done with the CUDA 9.1 Toolkit, ifort 17.0.4.196, and a Tesla P4 GPU.

After successfully implementing a direct solve via QR factorization (effectively a Fortran translation of the CUDA sample cuSolverSp_LinearSolver.cpp), I ran into trouble trying to implement the iterative BiCGStab method (essentially a translation of the CUDA sample pbicgstab.cpp). My first BiCGStab attempt follows that sample directly (using the cusparseDcsrilu0 preconditioner); as a sanity check, the second attempt uses the domino-scheme cusparseDcsrilu02 preconditioner routines instead.
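For reference, the second attempt goes through cusparseDcsrilu02_analysis, which is not shown in the reduced code above. Below is a sketch of a Fortran interface for it, written from my reading of the CUDA 9.1 cuSPARSE documentation rather than copied from my repo (the csrilu02 routine additionally takes a solve policy and a device work buffer):

 ! Sketch only; written from the CUDA 9.1 documentation, not copied from my repo.
 ! All device pointers are type(c_ptr) passed by value.
 integer(c_int) function cusparseDcsrilu02_analysis(handle,m,nnz,descrA, &
                 csrValA,csrRowPtrA,csrColIndA,info,policy,pBuffer) &
                 bind(C,name="cusparseDcsrilu02_analysis")

 use iso_c_binding
 implicit none
 type(c_ptr), value    :: handle      ! cusparseHandle_t
 integer(c_int), value :: m           ! number of rows
 integer(c_int), value :: nnz         ! number of nonzeros
 type(c_ptr), value    :: descrA      ! cusparseMatDescr_t
 type(c_ptr), value    :: csrValA     ! device pointer to CSR values
 type(c_ptr), value    :: csrRowPtrA  ! device pointer to CSR row offsets
 type(c_ptr), value    :: csrColIndA  ! device pointer to CSR column indices
 type(c_ptr), value    :: info        ! csrilu02Info_t
 integer(c_int), value :: policy      ! cusparseSolvePolicy_t
 type(c_ptr), value    :: pBuffer     ! device work buffer
 end function cusparseDcsrilu02_analysis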

In both BiCGStab cases the analysis phase (cusparseDcsrsv_analysis in the first attempt, cusparseDcsrilu02_analysis in the second) returns a CUSPARSE_INTERNAL_ERROR flag that I have not been able to resolve.

I have put together a GitHub repo with the files needed to form minimal test cases for both the BiCGStab methods and the QR solver method, using a 5x5 matrix with 13 nonzeros and a known solution. QR works; the BiCGStab methods do not.
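For reference, the CSR arrays (ib, jb, b) hard-coded in the reduced example above describe the following dense 5x5 system (written out by hand from those arrays), with the known solution alongside:

 A = | 1  1  0  5  2 |      rhs = | 1 |      x_known = |  0.08 |
     | 0  1  3  0  0 |            | 2 |                |  0.20 |
     | 0  2  1  0  0 |            | 1 |                |  0.60 |
     | 6  0  3  1  0 |            | 3 |                |  0.72 |
     | 0  0  0  2  1 |            | 0 |                | -1.44 |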

Running cuda-memcheck on the second BiCGStab attempt (cuda_BiCGStab2) produces:

========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemsetAsync.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/nvidia/libcuda.so.1 [0x332863]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x37f511]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x29b7fd]
=========     Host Frame:test_cuda [0x68e9]
=========     Host Frame:test_cuda [0x3334]
=========     Host Frame:test_cuda [0x1f3e]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:test_cuda [0x1e49]
========= 
  Error during csrilu02_analysis  7
========= ERROR SUMMARY: 1 error

Running cuda-memcheck on the first BiCGStab attempt (cuda_BiCGStab) produces 32 instances (with increasing thread IDs) of:

========= Invalid __global__ read of size 4
=========     at 0x00000070 in void convert_CsrToCoo_kernel<int=1>(int const *, int, int, int*)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x0061e990 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/nvidia/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x23c06d]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x34dabb]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x36ad0e]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x2f3339]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 (cusparseXcsr2coo + 0x1fd) [0x2f355d]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x2fa027]
=========     Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0xc4fa4]
=========     Host Frame:test_cuda [0xc9c0]
=========     Host Frame:test_cuda [0x2d6f]
=========     Host Frame:test_cuda [0x1f3e]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
=========     Host Frame:test_cuda [0x1e49]
=========
 Error during L of LU analyzing sub2  2
========= ERROR SUMMARY: 32 errors

The final "Error during ..." line in each listing comes from my own code, which prints the integer returned by the CUDA function. Without cuda-memcheck, both BiCGStab methods return the value 7, which I interpret as CUSPARSE_INTERNAL_ERROR; when run under cuda-memcheck, the first BiCGStab attempt returns 2 instead.
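To decode those integers, the cusparseStatus_t values as I read them in the CUDA 9.x documentation are listed below (hand-copied constants, not something NVIDIA ships as a Fortran module, so treat them as my assumption); on that reading, 7 is CUSPARSE_STATUS_INTERNAL_ERROR and 2 is CUSPARSE_STATUS_ALLOC_FAILED:

 ! cusparseStatus_t values as documented for CUDA 9.x (hand-copied;
 ! NVIDIA does not provide these as Fortran parameters, so verify against cusparse.h)
 integer(c_int), parameter :: CUSPARSE_STATUS_SUCCESS                   = 0
 integer(c_int), parameter :: CUSPARSE_STATUS_NOT_INITIALIZED           = 1
 integer(c_int), parameter :: CUSPARSE_STATUS_ALLOC_FAILED              = 2
 integer(c_int), parameter :: CUSPARSE_STATUS_INVALID_VALUE             = 3
 integer(c_int), parameter :: CUSPARSE_STATUS_ARCH_MISMATCH             = 4
 integer(c_int), parameter :: CUSPARSE_STATUS_MAPPING_ERROR             = 5
 integer(c_int), parameter :: CUSPARSE_STATUS_EXECUTION_FAILED          = 6
 integer(c_int), parameter :: CUSPARSE_STATUS_INTERNAL_ERROR            = 7
 integer(c_int), parameter :: CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8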

Any help in resolving this CUSPARSE_INTERNAL_ERROR, or frankly even just diagnostic hints, would be greatly appreciated.

TL;DR: Trying to diagnose a CUSPARSE_INTERNAL_ERROR from a BiCGStab method implemented in Fortran, using cuSparse routines through Fortran interfaces. The internal error comes from the *_analysis routines in the cuSparse library. I may have missed something small, or I may have a fundamental misunderstanding. Any input/help is greatly appreciated.

1 Answer:

Answer 0 (score: 1)

Yes, buried in the original 2000+ LOC repro case at the end of the GitHub link, there is an error in the interface definition for cusparseDcsrsv_analysis. It should be

 integer(c_int) function cusparseDcsrsv_analysis(handle,transA, &
                 m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info) &
                 bind(C,name="cusparseDcsrsv_analysis")

  use iso_c_binding
  implicit none
  type(c_ptr), value :: handle
  integer(c_int), value :: transA
  integer(c_int), value :: m
  integer(c_int),value :: nnz
  type(c_ptr), value :: descrA
  type(c_ptr), value :: csrValA
  type(c_ptr), value :: csrRowPtrA
  type(c_ptr), value :: csrColIndA
  type(c_ptr), value :: info
 end function cusparseDcsrsv_analysis

i.e. the device pointer arguments require the value attribute in order to be passed correctly to the C routine.
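To spell out why (my own sketch, using a made-up C function c_takes_device_ptr purely for illustration; it is not part of CUDA or cuSPARSE): the C side expects to receive the device address itself, so the c_ptr dummy must be passed by value. Without value, Fortran passes the address of the c_ptr variable, a host pointer-to-pointer in effect, and the library then dereferences a host address as if it were a device pointer, which is consistent with the "Invalid __global__ read" that cuda-memcheck reports.

 ! Illustration only: "c_takes_device_ptr" is a made-up C function,
 ! void c_takes_device_ptr(double *p), used to show the binding rule.
 interface
  subroutine c_takes_device_ptr(p) bind(C, name="c_takes_device_ptr")
   use iso_c_binding
   implicit none
   ! With "value", C receives the device address stored in p.
   ! Without "value", C would receive the address of the Fortran c_ptr
   ! variable (a host address), and dereferencing it as a device pointer
   ! reads out of bounds.
   type(c_ptr), value :: p
  end subroutine c_takes_device_ptr
 end interface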

You may have made the same mistake elsewhere, and there may well be other problems lurking in your code base, but after fixing this obvious error in the MCVE you edited into your question, I was able to get a modified version of that repro case to run correctly.