我正在尝试使用MPI的共享内存功能。我有若干个SMP节点,每个节点有四个核心。我需要一个大小为N的数组,并且在每个节点上,该数组应当能被该节点内的全部四个核心访问。我的计划是使用MPI_Win_allocate_shared构建一个大小为N/4的共享窗口,这样我预期每个节点的内存使用量为N。在下面的例子中,N是4×10^9字节,但每个节点的内存使用量不是4GB,而是16GB。我是不是遗漏了什么?
#include <iostream>
#include <mpi.h>
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int rank_all;
int rank_sm;
int size_sm;
// all communicator
MPI_Comm comm_sm;
MPI_Comm_rank(MPI_COMM_WORLD, &rank_all);
// shared memory communicator
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_sm);
MPI_Comm_rank(comm_sm, &rank_sm);
MPI_Comm_size(comm_sm, &size_sm);
std::size_t local_window_count(1000000000);
char* base_ptr;
MPI_Win win_sm;
int disp_unit(sizeof(char));
MPI_Win_allocate_shared(local_window_count * disp_unit, disp_unit, MPI_INFO_NULL, comm_sm, &base_ptr, &win_sm);
// write
char buffer;
if (rank_sm == 0) {
buffer = 'A';
}
else if (rank_sm == 1) {
buffer = 'C';
}
else if (rank_sm == 2) {
buffer = 'G';
}
else {
buffer = 'T';
}
MPI_Win_fence(0, win_sm);
for (std::size_t it = 0; it < local_window_count; it++) {
base_ptr[it] = buffer;
}
MPI_Win_fence(0, win_sm);
// read
long long int index_start(-1 * rank_sm * local_window_count);
long long int index_end((size_sm - rank_sm) * local_window_count - 1);
for (long long int it_rel = index_start; it_rel < index_end; it_rel++) {
buffer = base_ptr[it_rel];
if (it_rel == index_start) {
std::cout << rank_sm << " start: " << buffer << std::endl;
}
else if (it_rel == (index_end - 1)) {
std::cout << rank_sm << " end: " << buffer << std::endl;
}
}
MPI_Finalize();
return 0;
}