Question

我正在尝试使用 MPI 的共享内存功能。我有若干个 SMP 节点，每个节点有四个核心。我需要一个大小为 N 的数组，并且每个节点上的这个数组都应能被该节点内的全部四个核心访问。我的计划是让每个进程用 MPI_Win_allocate_shared 创建一个大小为 N/4 的共享窗口，这样我预期每个节点的内存使用量为 N。在下面的例子中，N 是 4×10^9 字节，但每个节点的内存使用量不是 4GB，而是 16GB。我是不是遗漏了什么？

#include <iostream>
#include <mpi.h>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int rank_all;
   int rank_sm;
   int size_sm;

   // all communicator
   MPI_Comm comm_sm;
   MPI_Comm_rank(MPI_COMM_WORLD, &rank_all);

   // shared memory communicator
   MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_sm);
   MPI_Comm_rank(comm_sm, &rank_sm);
   MPI_Comm_size(comm_sm, &size_sm);

   std::size_t local_window_count(1000000000);

   char* base_ptr;
   MPI_Win win_sm;
   int disp_unit(sizeof(char));
   MPI_Win_allocate_shared(local_window_count * disp_unit, disp_unit, MPI_INFO_NULL, comm_sm, &base_ptr, &win_sm);

   // write
   char buffer;
   if (rank_sm == 0) {
      buffer = 'A';
   }
   else if (rank_sm == 1) {
      buffer = 'C';
   }
   else if (rank_sm == 2) {
      buffer = 'G';
   }
   else {
      buffer = 'T';
   }

   MPI_Win_fence(0, win_sm);

   for (std::size_t it = 0; it < local_window_count; it++) {
      base_ptr[it] = buffer;
   }

   MPI_Win_fence(0, win_sm);

   // read
   long long int index_start(-1 * rank_sm * local_window_count);
   long long int index_end((size_sm - rank_sm) * local_window_count - 1);

   for (long long int it_rel = index_start; it_rel < index_end; it_rel++) {
      buffer = base_ptr[it_rel];

      if (it_rel == index_start) {
         std::cout << rank_sm << " start: " << buffer << std::endl;
      }
      else if (it_rel == (index_end - 1)) {
         std::cout << rank_sm << " end: " << buffer << std::endl;
      }
   }

   MPI_Finalize();

   return 0;
}

MPI共享内存的总内存使用量

0 个答案: