Cuda Fortran:从cpu到gpu的数据拷贝

时间:2014-01-14 00:23:09

标签: cuda fortran

我有一个关于从主机(host)到设备(device)复制数据的问题。问题如下:我有如下定义的数组

 ! Host array covers the interior only; the device array carries an extra
 ! one-cell halo on the lower side (and up to imax/jmax/kmax on the upper side).
 real, allocatable         :: cpuArray(:,:,:)
 real, device, allocatable :: gpuArray(:,:,:)
 allocate(cpuArray(0:imax-1,0:jmax-1,0:kmax-1))
 allocate(gpuArray(-1:imax,-1:jmax,-1:kmax))

 ! array initialization
 cpuArray = randomValue  ! non-zero value

 gpuArray = 0.0  ! first zero all gpu array elements
 ! host-to-device copy into the interior section only; the section shape
 ! matches cpuArray, so the halo cells keep the value assigned above
 gpuArray(0:imax-1,0:jmax-1,0:kmax-1) = cpuArray

我的期望是,只有gpuArray中的指定索引才能从主机接收数据,但它不起作用。

你能帮我找到这个有什么问题吗?

PS:我的方法基于PGI主页tutorial

- 当我设置cpuArray和gpuArray相同的维度时, 我得到了正确的结果。

但是在当前情况下,gpuArray 中的所有元素都是 0。我把默认值改成了非零值(即 gpuArray = 10.0 !先把所有 gpu 数组元素置为 10),但结果仍为 0。

祝你好运, Adjeiinfo

1 个答案:

答案 0 :(得分:1)

我向整个社区道歉。我已经解决了自己的问题。这是我在测试程序中引入的一个愚蠢的错误:在检查程序中,我本应写 cpuArrray= cpuArray(0:imax-1,0:jmax-1,0:kmax-1),却写成了 cpuArrray= cpuArray。所以程序本身运行正常,出错的是用来检查结果的那部分代码。

感谢您的跟进。

供参考,这是程序的一部分(可以构建和运行)

     module mytest
   ! Shared state and a copy kernel for a small host/device transfer test.
   ! All arrays are module-level allocatables; the host program is expected
   ! to set the extents and allocate the arrays before launching the kernel.
   use cudafor
   implicit none

   ! Problem extents followed by the host-side loop counters.
   integer :: imax, jmax, kmax
   integer :: i, j, k

   ! Host arrays.
   real, allocatable, dimension(:,:,:) :: h_a, h_b, h_c

   ! Device arrays read by the kernel.
   real, device, allocatable, dimension(:,:,:) :: d_b, d_c

   ! Device arrays written by the kernel.
   real, device, allocatable, dimension(:,:,:) :: d_b_copy, d_c_copy

   contains

   ! Kernel: each thread copies the elements (ix, iy, 0:1) of d_b into
   ! d_b_copy and of d_c into d_c_copy.
   !
   ! ix and iy are the zero-based global thread indices in x and y.
   ! There is no bounds check, so the launch configuration must not produce
   ! indices outside the allocated bounds of the four device arrays.
   attributes(global) subroutine testdata()
     integer :: ix, iy, iz

     ! threadIdx and blockIdx are 1-based, hence the "- 1" terms.
     ix = threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x
     iy = threadIdx%y - 1 + (blockIdx%y - 1) * blockDim%y

     ! Only the two k-planes 0 and 1 are copied.
     do iz = 0, 1
        d_b_copy(ix, iy, iz) = d_b(ix, iy, iz)
        d_c_copy(ix, iy, iz) = d_c(ix, iy, iz)
     end do
   end subroutine testdata

 end module mytest 

 program Test
   ! Host driver: fills h_a, copies it to the device (once into a same-shape
   ! array, once into the interior of a larger array with a halo), runs the
   ! testdata kernel to duplicate both device arrays, copies the interiors
   ! back and dumps them to text files for visual comparison with h_a.
   use mytest
   implicit none
   type(dim3) :: dimGrid, dimBlock
   integer :: istat

   imax = 32
   jmax = 32
   kmax = 2      ! the kernel copies k = 0..1 only, so kmax must stay 2

   ! The kernel has no bounds guard, so the launch must produce exactly the
   ! index range 0:imax-1 x 0:jmax-1.  A single block of imax x jmax threads
   ! does that (requires imax*jmax <= 1024 threads per block).  The previous
   ! 2x2 grid of 32x32 blocks generated indices up to 63 and accessed the
   ! device arrays out of bounds.
   dimGrid  = dim3(1, 1, 1)
   dimBlock = dim3(imax, jmax, 1)

   allocate(h_a(0:imax-1,0:jmax-1,0:1))
   allocate(h_b(0:imax-1,0:jmax-1,0:1))
   allocate(h_c(0:imax-1,0:jmax-1,0:1))

   ! d_b has the host shape; d_c carries extra halo cells around the interior.
   allocate(d_b(0:imax-1,0:jmax-1,0:1))
   allocate(d_c(-1:imax,-1:jmax,-1:16))

   allocate(d_b_copy(0:imax-1,0:jmax-1,0:1))
   allocate(d_c_copy(-1:imax,-1:jmax,-1:1))

   ! array initialization (innermost loop over the first index)
   do k = 0, kmax-1
      do j = 0, jmax-1
         do i = 0, imax-1
            h_a(i,j,k) = i*0.1
         end do
      end do
   end do

   ! data transfer (cpu to gpu): whole array, then interior section only
   d_b = h_a
   d_c(0:imax-1,0:jmax-1,0:kmax-1) = h_a

   call testdata<<<dimGrid,dimBlock>>>()

   ! Report launch-configuration errors instead of silently reading back
   ! whatever happens to be in device memory.
   istat = cudaGetLastError()
   if (istat /= cudaSuccess) then
      write(*,*) 'testdata launch failed: ', trim(cudaGetErrorString(istat))
      stop 1
   end if

   ! Wait for the kernel so that execution errors surface here.
   istat = cudaDeviceSynchronize()
   if (istat /= cudaSuccess) then
      write(*,*) 'testdata execution failed: ', trim(cudaGetErrorString(istat))
      stop 1
   end if

   ! copy back to cpu (interior sections only)
   h_b = d_b_copy(0:imax-1,0:jmax-1,0:kmax-1)
   h_c = d_c_copy(0:imax-1,0:jmax-1,0:kmax-1)

   ! just for visual test
   write(*,*) h_b
   open(24,file='h_a.dat')
   write(24,*) h_a
   close(24)

   open(24,file='d_b_copy.dat')
   write(24,*) h_b
   close(24)

   open(24,file='d_c_copy.dat')
   write(24,*) h_c
   close(24)

 end program Test