I am using OpenMPI 1.3 on a small cluster.
This is the function I am calling:
void invertColor_Parallel(struct image *im, int size, int rank)
{
    int i, j, aux, r;
    int total_pixels = (*im).ih.width * (*im).ih.height;
    int qty = total_pixels/(size-1);
    int rest = total_pixels % (size-1);
    MPI_Status status;

    //printf("\n%d\n", rank);

    if (rank == 0)
    {
        // master: send each worker its offset, its pixel count and its chunk of the image
        for (i = 1; i < size; i++) {
            j = i*qty - qty;
            aux = j;

            if (rest != 0 && i == size-1) { qty = qty + rest; } // to distribute the whole workload

            //printf("\nj: %d qty: %d rest: %d\n", j, qty, rest);

            MPI_Send(&aux, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG+1, MPI_COMM_WORLD);
            MPI_Send(&qty, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG+2, MPI_COMM_WORLD);
            MPI_Send(&(*im).array[j], qty*3, MPI_BYTE, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD);
        }
    }
    else
    {
        // workers: receive a chunk, invert its colors and send it back
        MPI_Recv(&aux, 1, MPI_INT, MPI_ANY_SOURCE, MASTER_TO_SLAVE_TAG+1, MPI_COMM_WORLD, &status);
        MPI_Recv(&qty, 1, MPI_INT, MPI_ANY_SOURCE, MASTER_TO_SLAVE_TAG+2, MPI_COMM_WORLD, &status);

        pixel *arreglo = (pixel *)calloc(qty, sizeof(pixel));
        MPI_Recv(&arreglo[0], qty*3, MPI_BYTE, MPI_ANY_SOURCE, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status);
        //printf("Receiving node=%d, message=%d\n", rank, aux);

        for (i = 0; i < qty; i++)
        {
            arreglo[i].R = 255 - arreglo[i].R;
            arreglo[i].G = 255 - arreglo[i].G;
            arreglo[i].B = 255 - arreglo[i].B;
        }

        MPI_Send(&aux, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG+1, MPI_COMM_WORLD);
        MPI_Send(&qty, 1, MPI_INT, 0, SLAVE_TO_MASTER_TAG+2, MPI_COMM_WORLD);
        MPI_Send(&arreglo[0], qty*3, MPI_BYTE, 0, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD);

        free(arreglo);
    }

    if (rank == 0) {
        //printf("\nrank: %d\n", rank);

        for (i = 1; i < size; i++) // until all slaves have handed back the processed data
        {
            MPI_Recv(&aux, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+1, MPI_COMM_WORLD, &status);
            MPI_Recv(&qty, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+2, MPI_COMM_WORLD, &status);
            MPI_Recv(&(*im).array[aux], qty*3, MPI_BYTE, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
        }
    }
}
int main(int argc, char *argv[])
{
    ////////// time counter
    clock_t begin;

    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Status status;

    int op = (int)atof(argv[1]);
    char filename_toload[50];
    int bright_number = 0;
    struct image image2;

    if (rank == 0)
    {
        loadImage(&image2, argv[2]);
    }

    // Broadcast the user's choice to all other ranks
    MPI_Bcast(&op, 1, MPI_INT, 0, MPI_COMM_WORLD);

    switch(op)
    {
        case 1:
            if (rank == 0) { begin = clock(); }
            MPI_Barrier(MPI_COMM_WORLD);
            invertColor_Parallel(&image2, size, rank);
            MPI_Barrier(MPI_COMM_WORLD);
            if (rank == 0) { runningTime(begin, clock()); printf("Se invirtieron los colores de la imagen\n\n"); }
            break;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0)
    {
        saveImage(&image2, argv[3]);
        free(image2.array);
    }

    MPI_Finalize();

    return 0;
}
Sometimes I get the following error:
cluster@maestro:/mpi$ mpirun -np 60 -hostfile /home/hostfile paralelo 1 image.bmp out.bmp
cluster@nodo1's password:
[maestro:5194] *** An error occurred in MPI_Recv
[maestro:5194] *** on communicator MPI_COMM_WORLD
[maestro:5194] *** MPI_ERR_TRUNCATE: message truncated
[maestro:5194] *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)
--------------------------------------------------------------------------
mpirun has exited due to process rank 0 with PID 5194 on node maestro
exiting without calling "finalize". This may have caused other
processes in the application to be terminated by signals sent by
mpirun (as reported here).
--------------------------------------------------------------------------
[nodo1] [[49223,1],55][../../../../../../ompi/mca/btl/tcp/btl_tcp_frag.c:216:mca_btl_tcp_frag_recv] mca_btl_tcp_frag_recv: readv failed: Connection reset by peer (104)
Whether the error shows up or not depends on the process count; with -np 99, for example, it works just fine.
Any idea what is going on?
Answer (score: 1)
This piece of code is probably the culprit:
if (rank == 0) {
    //printf("\nrank: %d\n", rank);

    for (i = 1; i < size; i++) // until all slaves have handed back the processed data
    {
        MPI_Recv(&aux, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+1, MPI_COMM_WORLD, &status);
        MPI_Recv(&qty, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+2, MPI_COMM_WORLD, &status);
        MPI_Recv(&(*im).array[aux], qty*3, MPI_BYTE, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
    }
}
Because you are (ab)using MPI_ANY_SOURCE, you are creating the perfect conditions for message reception races. It is entirely possible that the first MPI_Recv matches a message from rank i, the second one matches a message from rank j, and the third one matches a message from rank k, where i, j and k all have different values. As a result, you may receive the wrong number of pixels into the wrong slot of the image. Moreover, if rank k happens to send more pixels than the qty value supplied by rank j, you get a truncation error (and that is exactly what you are getting). A word of advice: never use MPI_ANY_SOURCE carelessly unless you are absolutely certain that the algorithm is correct and no races can occur.
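The truncation part is easy to see in isolation: whenever a receive is posted with a smaller count than the matching message actually carries, MPI raises MPI_ERR_TRUNCATE, and under the default MPI_ERRORS_ARE_FATAL handler the job aborts just like in the output above. A minimal standalone sketch of that behaviour (run with two or more ranks):

/* Minimal sketch of the truncation error: rank 1 sends 10 ints while
   rank 0 posts a receive for only 5, so the incoming message is larger
   than the receive buffer and MPI reports MPI_ERR_TRUNCATE. */
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, sendbuf[10] = {0}, recvbuf[5];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 1)
        MPI_Send(sendbuf, 10, MPI_INT, 0, 0, MPI_COMM_WORLD);
    else if (rank == 0)
        MPI_Recv(recvbuf, 5, MPI_INT, 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    MPI_Finalize();
    return 0;
}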
Rewrite the code like this:
if (rank == 0) {
    //printf("\nrank: %d\n", rank);

    for (i = 1; i < size; i++) // until all slaves have handed back the processed data
    {
        MPI_Recv(&aux, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG+1, MPI_COMM_WORLD, &status);
        MPI_Recv(&qty, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG+2, MPI_COMM_WORLD, &status);
        MPI_Recv(&(*im).array[aux], qty*3, MPI_BYTE, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
    }
}
or, even better:
if (rank == 0) {
    //printf("\nrank: %d\n", rank);

    for (i = 1; i < size; i++) // until all slaves have handed back the processed data
    {
        MPI_Recv(&aux, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+1,
                 MPI_COMM_WORLD, &status);
        MPI_Recv(&qty, 1, MPI_INT, status.MPI_SOURCE, SLAVE_TO_MASTER_TAG+2,
                 MPI_COMM_WORLD, &status);
        MPI_Recv(&(*im).array[aux], qty*3, MPI_BYTE, status.MPI_SOURCE, SLAVE_TO_MASTER_TAG,
                 MPI_COMM_WORLD, &status);
    }
}
That way the three receives always take their messages from the same process, which eliminates the race condition. The second version works by first receiving a message from any rank, then reading the status.MPI_SOURCE field to find out which rank it actually came from, and using that rank for the two subsequent receives.
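If you also want to stop trusting the separately transmitted qty, the master can probe the pixel message from the selected worker and size the receive from the actual message length. This is only a sketch of that variant, assuming the same tags and variables as in the loop above:

if (rank == 0) {
    for (i = 1; i < size; i++)
    {
        int nbytes;

        /* lock onto one worker with the first receive, as above */
        MPI_Recv(&aux, 1, MPI_INT, MPI_ANY_SOURCE, SLAVE_TO_MASTER_TAG+1,
                 MPI_COMM_WORLD, &status);
        MPI_Recv(&qty, 1, MPI_INT, status.MPI_SOURCE, SLAVE_TO_MASTER_TAG+2,
                 MPI_COMM_WORLD, &status);

        /* probe the pixel message from the same worker, ask MPI how many
           bytes it really carries, and receive exactly that much */
        MPI_Probe(status.MPI_SOURCE, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
        MPI_Get_count(&status, MPI_BYTE, &nbytes);
        MPI_Recv(&(*im).array[aux], nbytes, MPI_BYTE, status.MPI_SOURCE,
                 SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
    }
}

With the count taken from the message itself, a mismatched qty can no longer trigger MPI_ERR_TRUNCATE.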