我一直在研究一段 Fortran 代码,它把 cuBLAS 的批处理 LU 分解和 cuSPARSE 的批处理三对角求解器用在带 ADI 预处理器的 BiCG 迭代求解器中。我使用的是计算能力 3.5 的 Kepler K20X 和 CUDA 5.5。由于没有 PGI 的 CUDA Fortran,我正在自己编写接口:
! C-interoperable interface for the cuBLAS batched LU factorization
! (cublasDgetrfBatched). All pointer arguments are raw device addresses
! passed by value, matching the C prototype.
! NOTE(review): KIND(CUBLAS_STATUS_SUCCESS) assumes CUBLAS_STATUS_SUCCESS
! is host-associated from the enclosing module — confirm in the full file.
FUNCTION cublasDgetrfBatched(handle, n, dA, ldda, dP, dInfo, nbatch) BIND(C, NAME="cublasDgetrfBatched")
USE, INTRINSIC :: ISO_C_BINDING
! Return value: cublasStatus_t, interoperable with a C enum/int.
INTEGER(KIND(CUBLAS_STATUS_SUCCESS)) :: cublasDgetrfBatched
TYPE(C_PTR), VALUE :: handle  ! cublasHandle_t
INTEGER(C_INT), VALUE :: n  ! order of each square matrix in the batch
TYPE(C_PTR), VALUE :: dA  ! DEVICE array of nbatch DEVICE pointers to matrices
INTEGER(C_INT), VALUE :: ldda  ! leading dimension of each matrix
TYPE(C_PTR), VALUE :: dP  ! device pivot array (n * nbatch C ints)
TYPE(C_PTR), VALUE :: dInfo  ! device per-matrix status array (nbatch C ints)
INTEGER(C_INT), VALUE :: nbatch  ! number of matrices in the batch
END FUNCTION cublasDgetrfBatched
我使用 cudaHostAlloc 在主机上分配固定(pinned)内存,在设备上为各矩阵分配内存,并分配一个存放指向各矩阵的设备指针的设备数组;随后把每个矩阵异步复制到设备上,执行分解操作,再把分解后的矩阵和主元异步复制回主机,以便对单个右端项执行回代:
! Pinned host buffers and device allocations for the batched LU.
! NOTE(review): none of the `stat` return codes are checked here — consider
! verifying cudaSuccess / CUBLAS_STATUS_SUCCESS after each call.
REAL(8), POINTER, DIMENSION(:,:,:) :: A  ! pinned host matrices (m x m x nbatch)
INTEGER, DIMENSION(:,:), POINTER :: ipiv  ! pinned host pivot indices (m x nbatch)
TYPE(C_PTR) :: cPtr_A, cPtr_ipiv  ! raw C pointers returned by cudaHostAlloc
TYPE(C_PTR), ALLOCATABLE, DIMENSION(:), TARGET :: dPtr_A  ! host array of per-matrix device pointers
TYPE(C_PTR) :: dPtr_ipiv, dPtr_A_d, dPtr_info  ! device pivots, device pointer-array, device info
INTEGER(C_SIZE_T) :: sizeof_A, sizeof_ipiv
...
! Allocate pinned host memory and overlay Fortran array pointers on it.
stat = cudaHostAlloc(cPtr_A, sizeof_A, cudaHostAllocDefault)
CALL C_F_POINTER(cPtr_A, A, (/m,m,nbatch/))
stat = cudaHostAlloc(cPtr_ipiv, sizeof_ipiv, cudaHostAllocDefault)
CALL C_F_POINTER(cPtr_ipiv, ipiv, (/m,nbatch/))
! One device allocation per matrix; dPtr_A(ibatch) holds its device address.
ALLOCATE(dPtr_A(nbatch))
DO ibatch=1,nbatch
stat = cudaMalloc(dPtr_A(ibatch), m*m*sizeof_double)
END DO
! Device-side array of the nbatch device pointers — this is what the
! batched cuBLAS routines expect as their dA argument.
stat = cudaMalloc(dPtr_A_d, nbatch*sizeof_cptr)
stat = cublasSetVector(nbatch, sizeof_cptr, C_LOC(dPtr_A(1)), 1, dPtr_A_d, 1)
! Device storage for pivots (m per matrix) and per-matrix info flags.
stat = cudaMalloc(dPtr_ipiv, m*nbatch*sizeof_cint)
stat = cudaMalloc(dPtr_info, nbatch*sizeof_cint)
...
! Asynchronous H2D upload of each matrix; the OpenMP loop lets several
! threads enqueue copies concurrently from pinned memory.
! NOTE(review): every thread enqueues into the same `mystream`, so the
! copies serialize on that stream — confirm this is intended.
!$OMP PARALLEL DEFAULT(shared) PRIVATE( stat, ibatch )
!$OMP DO
DO ibatch = 1,nbatch
stat = cublasSetMatrixAsync(m, m, sizeof_double, C_LOC(A(1,1,ibatch)), m, dPtr_A(ibatch), m, mystream)
END DO
!$OMP END DO
!$OMP END PARALLEL
...
! Batched LU on the device: dPtr_A_d is the device array of device matrix
! pointers; pivots land in dPtr_ipiv, per-matrix status in dPtr_info.
stat = cublasDgetrfBatched(cublas_handle, m, dPtr_A_d, m, dPtr_ipiv, dPtr_info, nbatch)
...
! Copy pivot indices back to pinned host memory (m x nbatch C ints).
stat = cublasGetMatrixAsync(m, nbatch, sizeof_cint, dPtr_ipiv, m, C_LOC(ipiv(1,1)), m, mystream)
! Copy each factorized matrix back; threads enqueue async D2H copies.
! NOTE(review): again all copies target the shared `mystream` and so
! serialize on it — confirm intended.
!$OMP PARALLEL DEFAULT(shared) PRIVATE( ibatch, stat )
!$OMP DO
DO ibatch = 1,nbatch
stat = cublasGetMatrixAsync(m, m, sizeof_double, dPtr_A(ibatch), m, C_LOC(A(1,1,ibatch)), m, mystream)
END DO
!$OMP END DO
!$OMP END PARALLEL
...
! Host-side back-substitution with the LU factors copied back from the GPU:
! solve A(:,:,ibatch) * x = rhs(:,ibatch) for a single right-hand side.
! Fix: `info` is written by dgetrs in every iteration but was left SHARED,
! a data race on the LAPACK status argument; it is now PRIVATE.
!$OMP PARALLEL DEFAULT(shared) PRIVATE( ibatch, x, stat, info )
!$OMP DO
DO ibatch = 1,nbatch
! Work on a private copy of the RHS so threads never touch each other's column mid-solve
x = rhs(:,ibatch)
CALL dgetrs( 'N', m, 1, A(1,1,ibatch), m, ipiv(1,ibatch), x(1), m, info )
rhs(:,ibatch) = x
END DO
!$OMP END DO
!$OMP END PARALLEL
...
我宁愿不必做最后这一步,但是 cublasDtrsmBatched 例程把矩阵规模限制在 32 以内,而我的矩阵规模是 80(批处理版的 Dtrsv 会更合适,但该例程并不存在)。启动多个 cublasDtrsv 内核的开销太大,使得在设备上执行回代得不偿失。
我需要在调用cublasDgetrfBatched和cusparseDgtsvStridedBatch之间执行其他操作。其中大部分当前正在主机上执行,OpenMP用于在批处理级别并行化循环。一些操作,例如每个矩阵的矩阵向量乘法,例如,正在使用OpenACC在设备上计算:
! Per-batch matrix-vector products Ax(:,ibatch) = A(:,:,ibatch)*x(:,ibatch)
! computed on the device with OpenACC.
!$ACC DATA COPYIN(A) COPYIN(x) COPYOUT(Ax)
...
!$ACC KERNELS
DO ibatch = 1,nbatch
! Zero the output column for this batch entry.
DO i = 1,m
Ax(i,ibatch) = zero
END DO
! Column-oriented accumulation: the innermost loop runs over the first
! index, giving stride-1 access in column-major storage.
DO j = 1,m
DO i = 1,m
Ax(i,ibatch) = Ax(i,ibatch) + A(i,j,ibatch)*x(j,ibatch)
END DO
END DO
END DO
!$ACC END KERNELS
...
!$ACC END DATA
我想在OpenACC上将更多的计算放在GPU上,但为了做到这一点,我需要能够将两者连接起来。如下所示:
! Sketch of the desired OpenACC / cuBLAS coupling (not yet working —
! this is the part the question asks about).
!$ACC DATA COPYIN(A) CREATE(info,A_d) COPYOUT(ipiv)
!$ACC HOST_DATA USE_DEVICE(A)
! NOTE(review): inside HOST_DATA, `A` already refers to its device
! address in host code, so calling acc_deviceptr on it here is dubious.
! A_d is also still a host array at this point, so filling it here does
! not produce the device-resident pointer array cuBLAS needs.
DO ibatch = 1,nbatch
A_d(ibatch) = acc_deviceptr(A(1,1,ibatch))
END DO
!$ACC END HOST_DATA
...
!$ACC HOST_DATA USE_DEVICE(ipiv,info)
! NOTE(review): cublasDgetrfBatched requires A_d to be a DEVICE array of
! device pointers; as written A_d is passed as a host array.
stat = cublasDgetrfBatched(cublas_handle, m, A_d, m, ipiv, info, nbatch)
!$ACC END HOST_DATA
...
!$ACC END DATA
我知道带有 use_device 子句的 host_data 构造在大多数情况下都是合适的,但因为我需要实际传给 cuBLAS 的是一个包含指向设备端矩阵的指针的设备数组,所以我不知道该如何继续。
任何人都可以提供任何见解吗?
谢谢。
答案 0 :(得分:1)
方案一:把一切都放到设备上,在设备上填充 A_d:

    ! 把一切都放到设备上
    !$ACC DATA COPYIN(A) CREATE(info,A_d) COPYOUT(ipiv)

    ! 在设备上填充 A_d 数组
    !$ACC PARALLEL LOOP
    DO ibatch = 1,nbatch
      A_d(ibatch) = A(1,1,ibatch)
    END DO
    !$ACC END PARALLEL

    ...

    ! 把 A_d、ipiv、info 的设备地址传给 cuBLAS
    !$ACC HOST_DATA USE_DEVICE(A_d,ipiv,info)
    stat = cublasDgetrfBatched(cublas_handle,m,A_d,m,ipiv,info,nbatch)
    !$ACC END HOST_DATA

    ...

    !$ACC END DATA
or
方案二:在主机上用 acc_deviceptr 填充 A_d,再把它复制到设备:

    ! 把除 A_d 之外的所有内容放到设备上
    !$ACC DATA COPYIN(A) CREATE(info) COPYOUT(ipiv)

    ! 在主机上填充 A_d 数组
    DO ibatch = 1,nbatch
      A_d(ibatch) = acc_deviceptr(A(1,1,ibatch))
    END DO

    ! 把 A_d 复制到设备
    !$ACC DATA COPYIN(A_d)
    ...

    ! 把 A_d 和其他数组的设备地址传给 cuBLAS
    !$ACC HOST_DATA USE_DEVICE(A_d,ipiv,info)
    stat = cublasDgetrfBatched(cublas_handle,m,A_d,m,ipiv,info,nbatch)
    !$ACC END HOST_DATA

    ...
    !$ACC END DATA

    !$ACC END DATA