I am trying to run code from "CUDA Fortran for Scientists and Engineers", but it crashes with a segmentation fault that I do not understand:
[mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_1][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_2][error_sighandler] Caught error: Segmentation fault (signal 11)
[mpi_rank_3][error_sighandler] Caught error: Segmentation fault (signal 11)
My system is 64-bit Linux and I have the PGI compiler; the CUDA driver is 4.0. Below is the code I took from the book. It compiles, but MPI_Sendrecv does not seem to work. I have MVAPICH2 1.8 installed. The code was compiled with this command:
/usr/.../mvapich/bin/mpif90 filename.cuf
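A four-rank run is then launched through MVAPICH2 (I am assuming the usual mpirun_rsh launcher here, which as far as I know is what sets the MV2_COMM_WORLD_LOCAL_RANK variable the code reads before MPI_Init; the hostnames are placeholders):
/usr/.../mvapich/bin/mpirun_rsh -np 4 node1 node1 node2 node2 ./a.out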
Edit, in response to the comments:
When I build with the -C option, compilation fails:
pgfortran-Info-Switch -Mvect -fast forces -O2
PGF90-S-0155-Kernel region ignored; see -Minfo messages (transposeMVA.cuf: 188)
transposempi:
140, Loop not vectorized/parallelized: contains call
146, Loop not vectorized/parallelized: contains call
157, Loop not vectorized/parallelized: contains call
190, Accelerator restriction: function/procedure calls are not supported
Loop not vectorized/parallelized: contains call
191, Accelerator restriction: function/procedure calls are not supported
217, all reduction inlined
Loop not vectorized/parallelized: contains call
0 inform, 0 warnings, 1 severes, 0 fatal for transposempi
When I remove the -C option, it compiles, but the result is still the same:
/mpif90 -g -O0 -Minfo transposeMVA.cuf
pgfortran-Info-Switch -Mvect -fast forces -O2
transposempi:
140, Generated vector sse code for the loop
146, Loop not vectorized: may not be beneficial
Unrolled inner loop 8 times
157, Memory copy idiom, loop replaced by call to __c_mcopy4
178, Loop not vectorized/parallelized: contains call
190, CUDA kernel generated
190, !$cuf kernel do <<< (*,*), (128,1) >>>
217, all reduction inlined
I would appreciate any help.
module transpose_m
implicit none
integer, parameter :: cudaTileDim = 32
integer, parameter :: blockRows = 8
contains
attributes(global) &
subroutine cudaTranspose(odata, ldo, idata, ldi)
real, intent(out) :: odata(ldo,*)
real, intent(in) :: idata(ldi,*)
integer, value, intent(in) :: ldo, ldi
real, shared :: tile(cudaTileDim+1, cudaTileDim)
integer :: x, y, j
x = (blockIdx%x-1) * cudaTileDim + threadIdx%x
y = (blockIdx%y-1) * cudaTileDim + threadIdx%y
do j = 0, cudaTileDim-1, blockRows
   tile(threadIdx%x, threadIdx%y+j) = idata(x,y+j)
end do
call syncthreads()
x = (blockIdx%y-1) * cudaTileDim + threadIdx%x
y = (blockIdx%x-1) * cudaTileDim + threadIdx%y
do j = 0, cudaTileDim-1, blockRows
   odata(x,y+j) = tile(threadIdx%y+j, threadIdx%x)
end do
end subroutine cudaTranspose
end module transpose_m
!
! Main code
!
program transposeMPI
use cudafor
use mpi
use transpose_m
implicit none
! global array size
integer, parameter :: nx = 2048, ny = 2048
! host arrays (global)
real :: h_idata(nx,ny), h_tdata(ny,nx), gold(ny,nx)
! CUDA vars and device arrays
integer :: deviceID
type (dim3) :: dimGrid, dimBlock
real, device, allocatable :: &
d_idata(:,:), d_tdata(:,:), d_sTile(:,:), d_rTile(:,:)
! MPI stuff
integer :: mpiTileDimX, mpiTileDimY
integer :: myrank, nprocs, tag, ierr, localRank
integer :: nstages, stage, sRank, rRank
integer :: status(MPI_STATUS_SIZE)
real(8) :: timeStart, timeStop
character (len=10) :: localRankStr
integer :: i, j, nyl, jl, jg, p
integer :: xOffset, yOffset
! for MVAPICH set device before MPI initialization
call get_environment_variable('MV2_COMM_WORLD_LOCAL_RANK', &
localRankStr)
read(localRankStr,'(i10)') localRank
ierr = cudaSetDevice(localRank)
! MPI initialization
call MPI_init(ierr)
call MPI_comm_rank(MPI_COMM_WORLD, myrank, ierr)
call MPI_comm_size(MPI_COMM_WORLD, nProcs, ierr)
! check parameters and calculate execution configuration
if (mod(nx,nProcs) == 0 .and. mod(ny,nProcs) == 0) then
   mpiTileDimX = nx/nProcs
   mpiTileDimY = ny/nProcs
else
   write(*,*) 'nx and ny must be integral multiples of nProcs'
   call MPI_Finalize(ierr)
   stop
endif
if (mod(mpiTileDimX, cudaTileDim) /= 0 .or. &
     mod(mpiTileDimY, cudaTileDim) /= 0) then
   write(*,*) 'mpiTileDimX and mpiTileDimY must be ', &
        'integral multiples of cudaTileDim'
   call MPI_Finalize(ierr)
   stop
end if
if (mod(cudaTileDim, blockRows) /= 0) then
   write(*,*) 'cudaTileDim must be a multiple of blockRows'
   call MPI_Finalize(ierr)
   stop
end if
dimGrid = dim3(mpiTileDimX/cudaTileDim, &
mpiTileDimY/cudaTileDim, 1)
dimBlock = dim3(cudaTileDim, blockRows, 1)
! write parameters
if (myrank == 0) then
write(*,*)
write(*,"(/,'Array size: ', i0,'x',i0,/)") nx, ny
write(*,"('CUDA block size: ', i0,'x',i0, &
', CUDA tile size: ', i0,'x',i0)") &
cudaTileDim, blockRows, cudaTileDim, cudaTileDim
write(*,"('dimGrid: ', i0,'x',i0,'x',i0, &
', dimBlock: ', i0,'x',i0,'x',i0,/)") &
dimGrid%x, dimGrid%y, dimGrid%z, &
dimBlock%x, dimBlock%y, dimBlock%z
write(*,"('nprocs: ', i0, ', Local input array size: ', &
i0,'x',i0)") nprocs, nx, mpiTileDimY
write(*,"('mpiTileDim: ', i0,'x',i0,/)") &
mpiTileDimX, mpiTileDimY
endif
! initialize data
! host - each process has entire array on host (for now)
do p = 0, nProcs-1
   do jl = 1, mpiTileDimY
      jg = p*mpiTileDimY + jl
      do i = 1, nx
         h_idata(i,jg) = i+(jg-1)*nx
      enddo
   enddo
enddo
gold = transpose(h_idata)
! device - each process has
! nx*mpiTileDimY = ny*mpiTileDimX elements
allocate(d_idata(nx, mpiTileDimY), &
d_tdata(ny, mpiTileDimX), &
d_sTile(mpiTileDimX,mpiTileDimY), &
d_rTile(mpiTileDimX, mpiTileDimY))
yOffset = myrank*mpiTileDimY
d_idata(1:nx,1:mpiTileDimY) = &
h_idata(1:nx,yOffset+1:yOffset+mpiTileDimY)
d_tdata = -1.0
! ---------
! transpose
! ---------
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
timeStart = MPI_Wtime()
! 0th stage - local transpose
call cudaTranspose<<<dimGrid, dimBlock>>> &
(d_tdata(myrank*mpiTileDimY+1,1), ny, &
d_idata(myrank*mpiTileDimX+1,1), nx)
! other stages that involve MPI transfers
do stage = 1, nProcs-1
   ! sRank = the rank to which myrank sends data
   ! rRank = the rank from which myrank receives data
   sRank = modulo(myrank-stage, nProcs)
   rRank = modulo(myrank+stage, nProcs)
   call MPI_BARRIER(MPI_COMM_WORLD, ierr)
   ! pack tile so data to be sent is contiguous
   !$cuf kernel do(2) <<<*,*>>>
   do j = 1, mpiTileDimY
      do i = 1, mpiTileDimX
         d_sTile(i,j) = d_idata(sRank*mpiTileDimX+i,j)
      enddo
   enddo
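   ! NOTE: d_sTile and d_rTile are device arrays. Passing them directly to
   ! MPI_SENDRECV only works if the MPI library is CUDA-aware; with a
   ! non-CUDA-aware build the device pointers are treated as host memory,
   ! which is a likely cause of the segmentation fault (see the answer below).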
   call MPI_SENDRECV(d_sTile, mpiTileDimX*mpiTileDimY, &
        MPI_REAL, sRank, myrank, &
        d_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
        rRank, rRank, MPI_COMM_WORLD, status, ierr)
   ! do transpose from receive tile into final array
   ! (no need to unpack)
   call cudaTranspose<<<dimGrid, dimBlock>>> &
        (d_tdata(rRank*mpiTileDimY+1,1), ny, &
        d_rTile, mpiTileDimX)
end do ! stage
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
timeStop = MPI_Wtime()
! check results
h_tdata = d_tdata
xOffset = myrank*mpiTileDimX
if (all(h_tdata(1:ny,1:mpiTileDimX) == &
     gold(1:ny, xOffset+1:xOffset+mpiTileDimX))) then
   if (myrank == 0) then
      write(*,"('Bandwidth (GB/s): ', f7.2,/)") &
           2.*(nx*ny*4)/(1.0e+9*(timeStop-timeStart))
   endif
else
   write(*,"('[',i0,']',' *** Failed ***',/)") myrank
endif
! cleanup
deallocate(d_idata, d_tdata, d_sTile, d_rTile)
call MPI_Finalize(ierr)
end program transposeMPI
Answer (score: 1)
The following works. Thanks, Robert.
h_sTile = d_sTile
call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, &
MPI_REAL, sRank, myrank, &
h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
rRank, rRank, MPI_COMM_WORLD, status, ierr)
! copy received data back to the device buffer
d_rTile = h_rTile
I need to get the right MPICH. Thanks for the help.
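For completeness, here is a minimal sketch of how the host-staging fix fits into the program above. It is not the book's listing: the buffer names h_sTile and h_rTile and their allocatable declarations are my own additions, assumed to live next to the other host arrays and sized like the device tiles.
! host staging buffers (added; not in the original listing)
real, allocatable :: h_sTile(:,:), h_rTile(:,:)

! allocate them once mpiTileDimX/mpiTileDimY are known,
! alongside the device tile allocation
allocate(h_sTile(mpiTileDimX,mpiTileDimY), &
     h_rTile(mpiTileDimX,mpiTileDimY))

! inside the stage loop, replacing the device-buffer MPI_SENDRECV:
   ! device -> host copy of the packed send tile
   h_sTile = d_sTile
   call MPI_SENDRECV(h_sTile, mpiTileDimX*mpiTileDimY, &
        MPI_REAL, sRank, myrank, &
        h_rTile, mpiTileDimX*mpiTileDimY, MPI_REAL, &
        rRank, rRank, MPI_COMM_WORLD, status, ierr)
   ! host -> device copy of the received tile
   d_rTile = h_rTile

! and in the cleanup section
deallocate(h_sTile, h_rTile)
The extra host/device copies cost some bandwidth. The alternative is a CUDA-aware MPI library, so MPI_SENDRECV can take d_sTile and d_rTile directly as in the original listing; for MVAPICH2 that means a build with GPU support (run with MV2_USE_CUDA=1, as far as I know), which is presumably what "the right MPICH" above refers to.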