修改
这个问题被标记为偏离主题 “寻求调试帮助的问题(”为什么这段代码不起作用?“)必须包括所需的行为,特定的问题或错误以及在问题本身中重现它所需的最短代码......”
我在BiCGStab上做了两次破坏尝试,将下面的GitHub回购链接起来(错误来自* _analysis cuSparse函数)。由于这不能足够小,我已经删除了错误后发生的所有事情,并在此处包含了代码。由于NVidia没有提供fortran-to-c CUDA绑定(用fortran编写),因此该示例必须包含接口模块。
所需行为:没有从cuSparse分析例程返回CUSPARSE_INTERNAL_ERROR,以便我可以在fortran中实现BiCGStab。
特定错误:CUSPARSE_INTERNAL_ERROR,当使用cuda-memcheck运行时,此简化示例返回下面发布的第二个错误(convert_CsrToCoo中读取大小为4的32个实例超出范围)
!
! CUDA
!
module cuda_cusolve_map_reduced
interface
! cudaMemset
integer (c_int) function cudaMemset( devPtr,value, count ) &
bind (C, name="cudaMemset" )
use iso_c_binding
implicit none
type (c_ptr),value :: devPtr
integer(c_int), value :: value
integer(c_size_t), value :: count
end function cudaMemset
! cudaMalloc
integer (c_int) function cudaMalloc ( buffer, size ) &
bind (C, name="cudaMalloc" )
use iso_c_binding
implicit none
type (c_ptr) :: buffer
integer (c_size_t), value :: size
end function cudaMalloc
integer (c_int) function cudaMemcpy ( dst, src, count, kind ) &
bind (C, name="cudaMemcpy" )
! note: cudaMemcpyHostToDevice = 1
! note: cudaMemcpyDeviceToHost = 2
! note: cudaMemcpyDeviceToDevice = 3
use iso_c_binding
type (c_ptr), value :: dst, src
integer (c_size_t), value :: count, kind
end function cudaMemcpy
! cudaFree
integer (c_int) function cudaFree(buffer) bind(C, name="cudaFree")
use iso_c_binding
implicit none
type (c_ptr), value :: buffer
end function cudaFree
integer (c_int) function cudaMemGetInfo(fre, tot) &
bind(C, name="cudaMemGetInfo")
use iso_c_binding
implicit none
type(c_ptr),value :: fre
type(c_ptr),value :: tot
end function cudaMemGetInfo
integer(c_int) function cusparseCreate(cusparseHandle) &
bind(C,name="cusparseCreate")
use iso_c_binding
implicit none
type(c_ptr)::cusparseHandle
end function cusparseCreate
integer(c_int) function cudaStreamCreate(stream) &
bind(C,name="cudaStreamCreate")
use iso_c_binding
implicit none
type(c_ptr)::stream
end function cudaStreamCreate
integer(c_int) function cusolverSpSetStream(handle,stream) &
bind(C,name="cusolverSpSetStream")
use iso_c_binding
implicit none
type(c_ptr),value :: handle
type(c_ptr),value :: stream
end function cusolverSpSetStream
integer(c_int) function cusparseSetStream(cusparseHandle,stream) &
bind(C,name="cusparseSetStream")
use iso_c_binding
implicit none
type(c_ptr),value :: cusparseHandle
type(c_ptr),value :: stream
end function cusparseSetStream
integer(c_int) function cusparseCreateMatDescr(descrA) &
bind(C,name="cusparseCreateMatDescr")
use iso_c_binding
implicit none
type(c_ptr):: descrA
end function cusparseCreateMatDescr
integer(c_int) function cusparseSetMatType2(descrA,CUSPARSE_MATRIX_TYPE) &
bind(C,name="cusparseSetMatType")
use iso_c_binding
implicit none
type(c_ptr), value:: descrA
integer(c_int),value :: CUSPARSE_MATRIX_TYPE
end function cusparseSetMatType2
integer(c_int) function cusparseSetMatIndexBase2(descrA,CUSPARSE_INDEX_BASE) &
bind(C,name="cusparseSetMatIndexBase")
use iso_c_binding
implicit none
type(c_ptr), value:: descrA
integer(c_int),value :: CUSPARSE_INDEX_BASE
end function cusparseSetMatIndexBase2
integer(c_int) function cusparseSetMatFillMode(descrA,CUSPARSE_FILL_TYPE) &
bind(C,name="cusparseSetMatFillMode")
use iso_c_binding
implicit none
type(c_ptr), value:: descrA
integer(c_int),value :: CUSPARSE_FILL_TYPE
end function cusparseSetMatFillMode
integer(c_int) function cusparseSetMatDiagType(descrA,CUSPARSE_DIAG_TYPE) &
bind(C,name="cusparseSetMatDiagType")
use iso_c_binding
implicit none
type(c_ptr), value:: descrA
integer(c_int),value :: CUSPARSE_DIAG_TYPE
end function cusparseSetMatDiagType
integer(c_int) function cudaDeviceSynchronize() bind(C,name="cudaDeviceSynchronize")
use iso_c_binding
implicit none
end function cudaDeviceSynchronize
integer(c_int) function cusparseDestroy(cusparseHandle) bind(C,name="cusparseDestroy")
use iso_c_binding
implicit none
type(c_ptr),value::cusparseHandle
end function cusparseDestroy
integer(c_int) function cudaStreamDestroy(stream) bind(C,name="cudaStreamDestroy")
use iso_c_binding
implicit none
type(c_ptr),value :: stream
end function cudaStreamDestroy
integer(c_int) function cusparseDestroyMatDescr(descrA) bind(C,name="cusparseDestroyMatDescr")
use iso_c_binding
implicit none
type(c_ptr), value:: descrA
end function cusparseDestroyMatDescr
integer(c_int) function cusparseCreateSolveAnalysisInfo(info) &
bind(C,name="cusparseCreateSolveAnalysisInfo")
use iso_c_binding
implicit none
type(c_ptr) :: info
end function cusparseCreateSolveAnalysisInfo
integer(c_int) function cusparseDcsrsv_analysis(handle,transA, &
m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info) &
bind(C,name="cusparseDcsrsv_analysis")
use iso_c_binding
implicit none
type(c_ptr), value :: handle
integer(c_int), value :: transA
integer(c_int), value :: m
integer(c_int),value :: nnz
type(c_ptr), value :: descrA
type(c_ptr) :: csrValA
type(c_ptr) :: csrRowPtrA
type(c_ptr) :: csrColIndA
type(c_ptr), value :: info
end function cusparseDcsrsv_analysis
integer(c_int) function cusparseDestroySolveAnalysisInfo(info) &
bind(C,name="cusparseDestroySolveAnalysisInfo")
use iso_c_binding
implicit none
type(c_ptr),value::info
end function cusparseDestroySolveAnalysisInfo
end interface
end module cuda_cusolve_map_reduced
!
!======================================================================
!======================================================================
program main
implicit none
integer n,inz,i
parameter (n=5)
parameter (inz=13)
double precision x(n),x_known(n),rhs(n),b(inz)
integer ib(n+1),jb(inz)
write(*,'(A)') 'Setting up test system'
b(1) = 1.0d0;b(2) = 1.0d0;b(3) = 5.0d0;b(4) = 2.0d0
b(5) = 1.0d0;b(6) = 3.0d0;b(7) = 2.0d0;b(8) = 1.0d0
b(9) = 6.0d0;b(10) = 3.0d0;b(11) = 1.0d0;b(12) = 2.0d0
b(13) = 1.0d0
rhs(1) = 1.0d0;rhs(2) = 2.0d0;rhs(3) = 1.0d0
rhs(4) = 3.0d0;rhs(5) = 0.0d0
ib(1) = 1;ib(2) = 5;ib(3) = 7
ib(4) = 9;ib(5) = 12;ib(6) = 14
jb(1) = 1;jb(2) = 2;jb(3) = 4;jb(4) = 5
jb(5) = 2;jb(6) = 3;jb(7) = 2;jb(8) = 3
jb(9) = 1;jb(10) = 3;jb(11) = 4;jb(12) = 4
jb(13) = 5
x_known(1) = 0.08d0;x_known(2) = 0.2d0;x_known(3) = 0.6d0
x_known(4) = 0.72d0;x_known(5) = -1.44d0
x(1)=1.0d0;x(2)=1.0d0;x(3)=1.0d0
x(4)=1.0d0;x(5)=1.0d0
write(*,'(A)') 'Starting iterative solve'
call cuda_BiCGStab_error(n,rhs,x,inz,ib,jb,b)
write(*,'(A)') 'Found and Known solutions'
do 23 i = 1,n
write(*,*) x(i),x_known(i)
23 continue
end program main
!
!=========================================================
subroutine cuda_BiCGStab_error(n,rhs,x,inz,ib,jb,b)
!=========================================================
use iso_c_binding
use cuda_cusolve_map_reduced
implicit none
integer n, inz
double precision x(n), rhs(n), b(inz)
target rhs,b,x
integer ib(n+1),jb(inz)
target ib,jb
integer ii,ierr,ierr2
integer, parameter :: dp = kind(1.d0)
type(c_ptr) :: cusparseHandle
type(c_ptr) :: stream
type(c_ptr) :: descrA
type(c_ptr) :: descrM
type(c_ptr) :: info_l
type(c_ptr) :: info_u
type(c_ptr) :: ArowsIndex
type(c_ptr) :: AcolsIndex
type(c_ptr) :: Aval
type(c_ptr) :: h_x
type(c_ptr) :: h_rhs
! -------------------- pointers to device memory
type(c_ptr) :: devPtrArowsIndex
type(c_ptr) :: devPtrAcolsIndex
type(c_ptr) :: devPtrAval
type(c_ptr) :: devPtrMrowsIndex
type(c_ptr) :: devPtrMcolsIndex
type(c_ptr) :: devPtrMval
type(c_ptr) :: devPtrX
type(c_ptr) :: devPtrF
integer*8 Arow1_i_size,Arow_d_size,Acol_d_size,Annz_i_size,Annz_d_size
integer*8 cudaMemcpyDeviceToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToDevice
integer*4 CUBLAS_OP_N, CUBLAS_OP_T, CUBLAS_OP_TRI
parameter (cudaMemcpyHostToDevice=1)
parameter (cudaMemcpyDeviceToHost=2)
parameter (cudaMemcpyDeviceToDevice=3)
parameter (CUBLAS_OP_N=0)
parameter (CUBLAS_OP_T=1)
parameter (CUBLAS_OP_TRI=3)
ierr2 = 0
! define pointers to host memory
ArowsIndex = c_loc(ib)
AcolsIndex = c_loc(jb)
Aval = c_loc(b)
h_x = c_loc(x) ! x = A \ b
h_rhs = c_loc(rhs) ! b = ones(m,1)
Arow1_i_size=sizeof(ib(1:n+1))
Arow_d_size=sizeof(rhs(1:n))
Acol_d_size=sizeof(x(1:n))
Annz_i_size=sizeof(jb(1:inz))
Annz_d_size=sizeof(b(1:inz))
! Define the CUDA stream and matrix parameters
ierr = cusparseCreate(cusparseHandle)
ierr2 = ierr2 + ierr
ierr = cusparseCreateMatDescr(descrA)
ierr2 = ierr2 + ierr
ierr = cusparseCreateMatDescr(descrM)
ierr2 = ierr2 + ierr
ierr = cudaStreamCreate(stream)
ierr2 = ierr2 + ierr
ierr = cusparseSetStream(cusparseHandle,stream)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatType2(descrA,CUBLAS_OP_N)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatIndexBase2(descrA,CUBLAS_OP_T)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatType2(descrM,CUBLAS_OP_N)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatIndexBase2(descrM,CUBLAS_OP_T)
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
write(*,'(A, I2)') 'Error during matrix setup ',ierr2
stop
end if
write(*,*) 'Allocating GPU memory'
ierr = cudaMalloc(devPtrX,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrF,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrAval,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrAcolsIndex,Annz_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrArowsIndex,Arow1_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMalloc(devPtrMval,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
write(*,'(A, I2)') 'Error during CUDA allocation: ',ierr2
stop
end if
write(*,*) 'Cleaning GPU memory'
ierr = cudaMemset(devPtrX,0,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrF,0,Arow_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrAval,0,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrAcolsIndex,0,Annz_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrArowsIndex,0,Arow1_i_size)
ierr2 = ierr2 + ierr
ierr = cudaMemset(devPtrMval,0,Annz_d_size)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
write(*,'(A, I3)') 'Error during CUDA memory cleaning : ',ierr2
stop
end if
! transfer memory over to GPU
write(*,*) 'Transferring memory to GPU'
ierr = cudaMemcpy(devPtrArowsIndex,ArowsIndex,Arow1_i_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrAcolsIndex,AcolsIndex,Annz_i_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrAval,Aval,Annz_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrMval,devPtrAval,Annz_d_size,cudaMemcpyDeviceToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrX,h_x,Arow_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaMemcpy(devPtrF,h_rhs,Arow_d_size,cudaMemcpyHostToDevice)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
write (*, '(A, I2)') " Error during cuda memcpy ", ierr2
stop
end if
write(*,*) 'Creating analysis for LU'
ierr = cusparseCreateSolveAnalysisInfo(info_l)
ierr2 = ierr2 + ierr
ierr = cusparseCreateSolveAnalysisInfo(info_u)
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
write (*, '(A, I2)') " Error during LU analysis creation ", ierr2
stop
end if
write(*,*) 'Analyzing L of LU'
ierr = cusparseSetMatFillMode(descrM,CUBLAS_OP_N)
ierr2 = ierr2 + ierr
ierr = cusparseSetMatDiagType(descrM,CUBLAS_OP_T)
ierr2 = ierr2 + ierr
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
ierr = cusparseDcsrsv_analysis(cusparseHandle,CUBLAS_OP_N,n,inz,descrM,devPtrAval,&
devPtrArowsIndex,devPtrAcolsIndex,info_l)
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
write (*, '(A, I2)') " Error during L of LU analyzing sub2 ", ierr2
stop
end if
ierr = cudaDeviceSynchronize()
ierr2 = ierr2 + ierr
if (ierr2 .ne. 0 ) then
write (*, '(A, I2)') " Error during L of LU analyzing ", ierr2
stop
end if
ierr = cudaFree(devPtrArowsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrAcolsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrAval)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMrowsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMcolsIndex)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrMval)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrX)
ierr2 = ierr2 + ierr
ierr = cudaFree(devPtrF)
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
write(*,'(A, I2)') 'Error during cudafree: ',ierr2
stop
end if
ierr = cusparseDestroy(cusparseHandle)
ierr2 = ierr2 + ierr
ierr = cudaStreamDestroy(stream)
ierr2 = ierr2 + ierr
ierr = cusparseDestroyMatDescr(descrA)
ierr2 = ierr2 + ierr
ierr = cusparseDestroyMatDescr(descrM)
ierr2 = ierr2 + ierr
ierr = cusparseDestroySolveAnalysisInfo(info_l)
ierr2 = ierr2 + ierr
ierr = cusparseDestroySolveAnalysisInfo(info_u)
ierr2 = ierr2 + ierr
if (ierr2.ne.0) then
write(*,'(A, I2)') 'Error during cuda handle destruction: ',ierr2
stop
end if
return
end subroutine cuda_BiCGStab_error
结束编辑
我正在尝试将BiCGStab解算器方法的CUDA实现添加到传统的Fortran 77代码中,并增加了仅限于使用Fortran编译器的复杂性(CUDA函数的接口必须在Fortran中作为反对c / c ++)。后一种限制已被证明是一种额外的复杂因素,可能是我的问题的根源,但我的项目经理并没有对这一要求抱怨。我对Fortran很满意,但对CUDA来说实际上是一个新手,所以如果我错过了一个小细节或者有一个基本的误解,我一点也不会感到惊讶。
我的所有测试都是使用CUDA 9.1 Toolkit,iFort 17.0.4.196和Tesla P4 GPU完成的。
使用QR分解成功实现直接求解方法(有效地将CUDA样本cuSolverSp_LinearSolver.cpp转换为fortran)后,我在尝试实现迭代BiCGStab方法时遇到了问题(实际上是CUDA样本pbicgstab的翻译)的.cpp)。我在BiCGStab上的第一次尝试直接来自示例(使用cusparseDcsrilu0预处理器),第二次尝试使用domino-scheme cusparseDcsrilu02预处理器例程进行完整性检查。
在两个BiCGStab情况下,分析阶段(第一次尝试的cusparseDcsrsv_analysis和第二次尝试的cusparseDcsrilu02_analysis)返回一个我无法解析的CUSPARSE_INTERNAL_ERRROR标志。
我使用必要的文件制作了一个GitHub repo来形成BiCGStab方法和QR求解器方法的最小测试用例,使用带有13个非零的5x5矩阵和一个已知的解决方案。 QR工作,BiCGStab方法没有。
使用第二次BiCGStab尝试(cuda_BiCGStab2)运行cuda-memcheck会导致:
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemsetAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib64/nvidia/libcuda.so.1 [0x332863]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x37f511]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x29b7fd]
========= Host Frame:test_cuda [0x68e9]
========= Host Frame:test_cuda [0x3334]
========= Host Frame:test_cuda [0x1f3e]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:test_cuda [0x1e49]
=========
Error during csrilu02_analysis 7
========= ERROR SUMMARY: 1 error
在第一次尝试BiCGStab(cuda_BiCGStab)时运行cuda-memcheck导致32(增加线程ID)实例
========= Invalid __global__ read of size 4
========= at 0x00000070 in void convert_CsrToCoo_kernel<int=1>(int const *, int, int, int*)
========= by thread (0,0,0) in block (0,0,0)
========= Address 0x0061e990 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib64/nvidia/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x23c06d]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x34dabb]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x36ad0e]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x2f3339]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 (cusparseXcsr2coo + 0x1fd) [0x2f355d]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0x2fa027]
========= Host Frame:/usr/local/cuda-9.1/targets/x86_64-linux/lib/libcusparse.so.9.1 [0xc4fa4]
========= Host Frame:test_cuda [0xc9c0]
========= Host Frame:test_cuda [0x2d6f]
========= Host Frame:test_cuda [0x1f3e]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xfd) [0x1ed1d]
========= Host Frame:test_cuda [0x1e49]
=========
Error during L of LU analyzing sub2 2
========= ERROR SUMMARY: 32 errors
后一行“......中的错误”来自我的代码并从CUDA函数打印返回的整数。如果没有cuda-memcheck,两个BiCGStab方法都返回值7,我将其解释为CUSPARSE_INTERNAL_ERROR但是当使用cuda-memcheck运行时,第一个BiCGStab尝试返回2。
任何有关解决此cusparse_internal_error的帮助,或者坦白说只是诊断提示,将不胜感激。
TL / DR:通过fortran接口使用cuSparse例程从Fortran实现的BiCGStab方法中诊断出CUSPARSE_INTERNAL_ERROR。 Internal_error来自cuSparse库中的* _analysis例程。我可能错过了一些小事,或者我可能有一个根本的误解。非常感谢任何输入/帮助。
答案 0 :(得分:1)
是的,在github链接末尾隐藏在原始的2000+ LOC repro案例中,cusparseDcsrsv_analysis
的接口定义中存在错误。它应该是
integer(c_int) function cusparseDcsrsv_analysis(handle,transA, &
m,nnz,descrA,csrValA,csrRowPtrA,csrColIndA,info) &
bind(C,name="cusparseDcsrsv_analysis")
use iso_c_binding
implicit none
type(c_ptr), value :: handle
integer(c_int), value :: transA
integer(c_int), value :: m
integer(c_int),value :: nnz
type(c_ptr), value :: descrA
type(c_ptr), value :: csrValA
type(c_ptr), value :: csrRowPtrA
type(c_ptr), value :: csrColIndA
type(c_ptr), value :: info
end function cusparseDcsrsv_analysis
即。设备指针需要value
属性才能正确传递给C子例程。
您可能在其他地方犯了这个错误,并且在您的代码库中的其他地方可能存在其他问题,但在修复了您在问题中编辑的MCVE中的明显错误后,我可以获得该repro案例的修改版本以正确运行。