I want to create a context between the MPI ranks on a node and the GPUs, and do this across multiple nodes.
I found some code here, which is in C. I work in Fortran, so I am trying to convert this code to Fortran.
The C code is as follows:
#include <mpi.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int stringCmp(const void *a, const void *b)
{
    return strcmp(a, b);
}

void assignDeviceToProcess(int *p2myrank)
{
    char host_name[MPI_MAX_PROCESSOR_NAME];
    char (*host_names)[MPI_MAX_PROCESSOR_NAME];
    MPI_Comm nodeComm;
    int i, n, namelen, color, rank, nprocs, myrank, gpu_per_node;
    size_t bytes;
    int dev, err1;
    struct cudaDeviceProp deviceProp;

    /* Check if the device has already been assigned */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Get_processor_name(host_name, &namelen);

    bytes = nprocs * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
    host_names = (char (*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(host_names[rank], host_name);
    for (n = 0; n < nprocs; n++)
    {
        MPI_Bcast(&(host_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    qsort(host_names, nprocs, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);

    color = 0;
    for (n = 0; n < nprocs; n++)
    {
        if (n > 0 && strcmp(host_names[n-1], host_names[n])) color++;
        if (strcmp(host_name, host_names[n]) == 0) break;
    }
    MPI_Comm_split(MPI_COMM_WORLD, color, 0, &nodeComm);
    MPI_Comm_rank(nodeComm, &myrank);
    MPI_Comm_size(nodeComm, &gpu_per_node);
    p2myrank[0] = myrank;
    return;

    /* Find out how many DP capable GPUs are in the system and their device number */
    int deviceCount, slot = 0;
    int *devloc;
    cudaGetDeviceCount(&deviceCount);
    devloc = (int *) malloc(deviceCount * sizeof(int));
    devloc[0] = 999;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaGetDeviceProperties(&deviceProp, dev);
        if (deviceProp.major > 1)
        {
            devloc[slot] = dev;
            slot++;
        }
    }
    //printf("Assigning device %d to process on node %s rank %d \n", devloc[myrank], host_name, rank);
    /* Assign device to MPI process and probe device properties */
    cudaSetDevice(devloc[myrank]);
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&deviceProp, dev);
    size_t free_bytes, total_bytes;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    printf("Host: %s Rank=%d Device= %d (%s) ECC=%s Free = %lu, Total = %lu\n", host_name, rank, devloc[myrank], deviceProp.name, deviceProp.ECCEnabled ? "Enabled " : "Disabled", (unsigned long)free_bytes, (unsigned long)total_bytes);
}
My Fortran code is:
subroutine MPI_to_gpu_assign(comm, nprocs)
  use cudafor
  use sort
  implicit none
  include "mpif.h"
  integer :: max_len, rank, code, comm, i, size, ierr, totaldev, n, namelen, color, nprocs
  integer :: nodeComm, first_time, myrank, proc_len
  character :: host_name(MPI_MAX_PROCESSOR_NAME)
  character :: host_names(nprocs*MPI_MAX_PROCESSOR_NAME)

  proc_len = MPI_MAX_PROCESSOR_NAME
  ! Check if the device has been assigned already
  if (first_time) then
     first_time = 0
     call MPI_Comm_rank(comm, rank, code)
     call MPI_Get_processor_name(host_name, namelen, code)
     host_names((rank-1)*proc_len+1:rank*proc_len) = host_name
     do n = 1, nprocs
        if (n.gt.1) then
           call MPI_Bcast(host_names((n-1)*proc_len+1:n*proc_len), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER, n-1, comm, code)
        else
           call MPI_Bcast(host_names(1:proc_len), MPI_MAX_PROCESSOR_NAME, MPI_CHARACTER, n-1, comm, code)
        end if
     end do
     call a_sort(host_names, my_compare)
     color = 0
     do n = 1, nprocs
        if (n.gt.1) then
           if (my_compare(host_names(((n-2)*proc_len+1):(n-1)*proc_len), host_names(((n-1)*proc_len+1):n*proc_len)) == 1) then !!line 1!!
              color = color + 1
           end if
           if (my_compare(host_name, host_names((n-1)*proc_len+1:n*proc_len)) == 1) then !!line 2!!
              exit
           end if
        else
           if (my_compare(host_name, host_names(1:proc_len)) == 1) then !!line 3!!
              exit
           end if
        end if
     end do
     call MPI_Comm_split(comm, color, 0, nodeComm, code)
     call MPI_Comm_rank(nodeComm, myrank, code)
     write(*,*) 'Assigning device', myrank, 'to process on node', host_name, ' on rank', rank, ''
     ! Assign device to MPI process
     ierr = cudaSetDevice(myrank)
     if (ierr.ne.0) then
        print *, cudaGetErrorString(ierr)
        stop
     end if
  end if
end subroutine MPI_to_gpu_assign
with sort and the other required functions defined as follows (which I use from here):
module sort
  implicit none
contains

  subroutine to_lower(str)
    character(len=*), intent(in out) :: str
    integer :: i
    do i = 1, len(str)
       select case (str(i:i))
       case ("A":"Z")
          str(i:i) = achar(iachar(str(i:i)) + 32)
       end select
    end do
  end subroutine to_lower

  integer function my_compare(a, b)
    character(*), intent(in) :: a, b
    character(len=max(len(a), len(b))) :: a1, b1
    a1 = a
    b1 = b
    call to_lower(b1)
    call to_lower(a1)
    if (len(trim(a)) > len(trim(b))) then
       my_compare = -1
    elseif (len(trim(a)) == len(trim(b))) then
       if (a1 > b1) then
          my_compare = 1
       else
          my_compare = -1
       end if
    else
       my_compare = 1
    end if
  end function my_compare

  subroutine a_sort(a, cc)
    character(len=*), dimension(:), intent(inout) :: a
    interface
       integer function cc(a, b)
         character(len=*), intent(in) :: a, b
       end function cc
    end interface
    integer :: i, j, increment
    character(len=max(len(a), 10)) :: temp
    increment = size(a) / 2
    do while (increment > 0)
       do i = increment + 1, size(a)
          j = i
          temp = a(i)
          do while (j >= increment+1 .and. cc(a(j-increment), temp) > 0)
             a(j) = a(j-increment)
             j = j - increment
          end do
          a(j) = temp
       end do
       if (increment == 2) then
          increment = 1
       else
          increment = increment * 5 / 11
       end if
    end do
  end subroutine a_sort

end module sort
But this does not seem to work, and it gives me the following errors:
PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 1)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 1)
PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 2)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 2)
PGF90-S-0446-Argument number 1 to my_compare: rank mismatch (line 3)
PGF90-S-0446-Argument number 2 to my_compare: rank mismatch (line 3)
Can anyone help me understand why I am getting this error?
Answer (score: 1)
Your error has nothing to do with CUDA; it is a basic Fortran mistake. In the calling code you declare arrays of length-1 characters:
character:: host_name(MPI_MAX_PROCESSOR_NAME)
character:: host_names(nprocs*MPI_MAX_PROCESSOR_NAME)
and you create array sections from them, which you pass to my_compare:
my_compare( host_names(((n-2)*proc_len+1):(n-1)*proc_len), host_names(((n-1)*proc_len+1):n*proc_len) )
but my_compare expects scalar character strings:
integer function my_compare(a, b)
character(*), intent(in) :: a, b
These are incompatible: a section of a character array is a rank-1 array of single characters, while the dummy arguments are scalars, which is exactly the rank mismatch the compiler reports. You should use scalar character variables in the main code as well:
character(MPI_MAX_PROCESSOR_NAME) :: host_name
character(nprocs*MPI_MAX_PROCESSOR_NAME) :: host_names
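For illustration, here is a minimal self-contained sketch of why the scalar form works (the program, the lengths, and the simplified my_compare are hypothetical, not your actual code): host_names(i:j) on a scalar character variable is a substring, which is itself a scalar and therefore matches a character(*) dummy argument.

! Hypothetical sketch: with scalar character variables, (i:j) selects a
! substring, i.e. a scalar, so it matches a character(*) dummy argument.
! my_compare is simplified here to plain dictionary order just to show
! that the call compiles.
program substring_demo
  implicit none
  integer, parameter :: proc_len = 8
  character(3*proc_len) :: host_names   ! one long scalar string
  character(proc_len)   :: host_name
  host_names = 'nodeB   nodeA   nodeB   '
  host_name  = 'nodeA'
  ! host_names(1:proc_len) is a substring (a scalar), so this call is legal
  print *, my_compare(host_name, host_names(1:proc_len))
contains
  integer function my_compare(a, b)
    character(*), intent(in) :: a, b
    if (a < b) then
       my_compare = -1
    else if (a == b) then
       my_compare = 0
    else
       my_compare = 1
    end if
  end function my_compare
end program substring_demo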
Actually, the best approach in Fortran is to use an array of host names:
character(MPI_MAX_PROCESSOR_NAME) :: host_names(nprocs)
but then you would have to change more of the code carried over from C. It would be a simplification, though.
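For illustration, a minimal self-contained sketch of the color computation with that array form (the program and the sample host names are hypothetical, and the MPI calls are left out; whole strings compare directly with == and /=, so my_compare is not even needed):

! Hypothetical sketch: computing the node color with an array of host names.
! host_names stands in for the gathered-and-sorted list of processor names.
program color_demo
  implicit none
  integer, parameter :: nprocs = 4, name_len = 16
  character(name_len) :: host_names(nprocs), host_name
  integer :: n, color
  host_names = [character(name_len) :: 'node01', 'node01', 'node02', 'node02']
  host_name  = 'node02'               ! this process's own host name
  color = 0
  do n = 1, nprocs
     if (n > 1) then
        if (host_names(n-1) /= host_names(n)) color = color + 1
     end if
     if (host_name == host_names(n)) exit
  end do
  print *, 'color =', color           ! would be the color passed to MPI_Comm_split
end program color_demo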