为什么MPI_Allreduce()挂起?

时间:2018-08-07 00:38:48

标签: c mpi

我在使用 MPI_Allreduce 时遇到了一个问题:某些进程提前退出执行循环,导致 MPI_Allreduce 似乎挂起。下面的代码是一个能复现该问题的简单示例,驱动应用程序的主循环如下所示:

  /* Every rank runs this loop. reduce_stddev() issues collective calls on
   * MPI_COMM_WORLD (MPI_Allreduce followed by MPI_Reduce), so each rank of
   * the communicator has to make the same sequence of calls. */
  for(j = 0; j < ITERATIONS; j++)
  {
    reduce_stddev(world_rank, world_size, num_elements_per_proc);

    /* Even-numbered ranks leave the loop after iteration ITERATIONS/2 while
     * odd-numbered ranks keep iterating. The next collective issued inside
     * reduce_stddev() by the odd ranks then has no matching call from the
     * ranks that left, which is where the program stops making progress. */
    if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
    {
      fprintf(stdout, "%d exiting\n", world_rank);
      fflush(stdout);
      break;
    }
  }

嵌套的 if 测试会使某些进程比其他进程更早退出循环;一旦发生这种情况,reduce_stddev 中对 MPI_Allreduce 的调用似乎就会挂起。下面是完整的代码清单:

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <mpi.h>

/* Not referenced anywhere in the visible code. */
#define N_ITEMS 100000
/* Upper bound on the number of passes of the driver loop in main(). */
#define ITERATIONS 32 

/**
 * Allocates an array of num_elements floats and fills it with pseudo-random
 * values in [0, 1] taken from rand().
 *
 * The caller owns the returned buffer and must release it with free().
 * The function never returns NULL: a negative count or an allocation failure
 * is reported on stderr and the process is aborted. Unlike assert(), this
 * check stays active when the program is built with NDEBUG.
 *
 * @param num_elements  number of values to generate (must be >= 0)
 * @return pointer to the newly allocated, filled array
 */
float *create_rand_nums(int num_elements) {
  if (num_elements < 0) {
    fprintf(stderr, "create_rand_nums: invalid element count %d\n",
            num_elements);
    abort();
  }

  size_t count = (size_t)num_elements;
  float *rand_nums = NULL;

  /* Guard the size computation against wrap-around on narrow size_t. */
  if (count <= (size_t)-1 / sizeof *rand_nums) {
    /* malloc(0) is allowed to return NULL, so always request at least one
     * element; a zero count then still yields a valid, freeable pointer. */
    rand_nums = malloc((count > 0 ? count : 1) * sizeof *rand_nums);
  }
  if (rand_nums == NULL) {
    fprintf(stderr, "create_rand_nums: cannot allocate %d floats\n",
            num_elements);
    abort();
  }

  for (size_t i = 0; i < count; i++) {
    rand_nums[i] = (rand() / (float)RAND_MAX);
  }
  return rand_nums;
}

/**
 * Generates num_elements_per_proc random samples on the calling rank, then
 * computes the mean and standard deviation of the samples of all ranks and
 * prints both on rank 0.
 *
 * This routine is collective: it calls MPI_Allreduce and then MPI_Reduce on
 * MPI_COMM_WORLD, so every rank of that communicator has to call it the same
 * number of times. If some ranks stop calling it, the ranks that still do
 * can block inside MPI_Allreduce.
 *
 * @param world_rank             rank of the caller in MPI_COMM_WORLD
 * @param world_size             number of ranks in MPI_COMM_WORLD
 * @param num_elements_per_proc  number of samples generated on this rank
 */
void reduce_stddev(int world_rank, int world_size, int num_elements_per_proc)
{
  fprintf(stdout, "Calling %s: %d\n", __func__, world_rank);
  fflush(stdout);

  /* Scale the time by rank + 1 rather than by the rank itself: multiplying
   * by the rank gave rank 0 the constant seed 0 on every call. */
  srand((unsigned int)time(NULL) * ((unsigned int)world_rank + 1u));
  float *rand_nums = create_rand_nums(num_elements_per_proc);

  /* Total sample count, computed in floating point so the product cannot
   * overflow int for large per-rank counts. */
  const float total_count = (float)num_elements_per_proc * (float)world_size;

  float local_sum = 0;
  int i;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sum += rand_nums[i];
  }

  /* Every rank needs the global sum to derive the mean, hence Allreduce. */
  float global_sum = 0.0f;
  fprintf(stdout, "%d: About to call all reduce\n", world_rank);
  fflush(stdout);
  MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM,
                MPI_COMM_WORLD);
  fprintf(stdout, "%d: done calling all reduce\n", world_rank);
  fflush(stdout);
  float mean = global_sum / total_count;

  float local_sq_diff = 0;
  for (i = 0; i < num_elements_per_proc; i++) {
    local_sq_diff += (rand_nums[i] - mean) * (rand_nums[i] - mean);
  }

  /* Only rank 0 reports the result, so a rooted reduction is sufficient.
   * The receive variable is initialised because MPI_Reduce only writes it
   * on the root. */
  float global_sq_diff = 0.0f;
  MPI_Reduce(&local_sq_diff, &global_sq_diff, 1, MPI_FLOAT, MPI_SUM, 0,
             MPI_COMM_WORLD);

  if (world_rank == 0) {
    float stddev = sqrtf(global_sq_diff / total_count);
    printf("Mean - %f, Standard deviation = %f\n", mean, stddev);
  }

  free(rand_nums);
}

int main(int argc, char* argv[]) {
  if (argc != 2) {
    fprintf(stderr, "Usage: avg num_elements_per_proc\n");
    exit(1);
  }

  int num_elements_per_proc = atoi(argv[1]);

  MPI_Init(NULL, NULL);

  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  unsigned long long j = 0;

  for(j = 0; j < ITERATIONS; j++)
  {
    reduce_stddev(world_rank, world_size, num_elements_per_proc);

    if( (j == (ITERATIONS/2)) && (world_rank % 2 == 0))
    {
      fprintf(stdout, "%d exiting\n", world_rank);
      fflush(stdout);
      break;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);
  MPI_Finalize();

  return EXIT_SUCCESS;
}

最后看到的输出是“[进程编号]: About to call all reduce”。

0 个答案:

没有答案