我只是想学习MPI,我正在使用MPI并行化矩阵乘法。以下是我的代码
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#define CLK CLOCK_MONOTONIC
struct timespec diff(struct timespec start, struct timespec end){
struct timespec temp;
if((end.tv_nsec-start.tv_nsec)<0){
temp.tv_sec = end.tv_sec-start.tv_sec-1;
temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
}
else{
temp.tv_sec = end.tv_sec-start.tv_sec;
temp.tv_nsec = end.tv_nsec-start.tv_nsec;
}
return temp;
}
int main(int argc, char* argv[])
{
struct timespec start_e2e, end_e2e, start_alg, end_alg, e2e, alg;
/* Should start before anything else */
clock_gettime(CLK, &start_e2e);
/* Check if enough command-line arguments are taken in. */
if(argc < 3) {
printf( "Usage: %s n p \n", argv[0] );
return -1;
}
MPI_Init(NULL, NULL);
const int n = atoi(argv[1]);
const int p = atoi(argv[2]);
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int** matA = (int **) malloc(n * sizeof(int *));
int** matB = (int **) malloc(n * sizeof(int *));
int** matC = (int **) malloc(n * sizeof(int *));
int i, j;
for(i = 0; i < n; i++)
{
matA[i] = (int *) malloc(n * sizeof(int));
matB[i] = (int *) malloc(n * sizeof(int));
matC[i] = (int *) malloc(n * sizeof(int));
for(j = 0; j < n; j++)
{
matB[i][j] = 1; // Initialize
matC[i][j] = 0; // Initialize
}
}
// Total number of processors
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if(world_rank == 0)
{
for(i = 0; i < n; i++)
{
for(j = 0; j < n; j++)
matA[i][j] = 2;
}
int destination;
double start = MPI_Wtime();
clock_gettime(CLK, &start_alg); /* Start the algo timer */
for(destination = 1; destination < world_size; destination++)
{
int start = destination * (n / world_size);
int end = (destination + 1) * (n / world_size);
if(destination == world_size - 1)
end = n;
int offset = start;
int rows = (end - start);
MPI_Send(&offset, 1, MPI_INT, destination, 1, MPI_COMM_WORLD); // Send offset
MPI_Send(&rows, 1, MPI_INT, destination, 2, MPI_COMM_WORLD); // Send number of rows
MPI_Send(&matA[offset][0], rows * n, MPI_INT, destination, 3, MPI_COMM_WORLD); // Send portion of matrix A
}
double sending = MPI_Wtime();
// Do matrix multiplication specific to master processor
int k;
int rows = n / world_size;
for(i = 0; i < rows; i++)
{
for(j = 0; j < n; j++)
{
for(k = 0; k < n; k++)
matC[i][j] += (matA[i][k] * matB[k][j]);
}
}
// Wait for other processors to complete and combine their results
double receiving = MPI_Wtime();
int source;
for(source = 1; source < world_size; source++)
{
int offset, rows;
MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive offset
MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive number of rows
MPI_Recv(&matC[offset][0], rows * n, MPI_INT, source, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive portion of matrix C
}
double end = MPI_Wtime();
clock_gettime(CLK, &end_alg); /* End the algo timer */
clock_gettime(CLK, &end_e2e);
e2e = diff(start_e2e, end_e2e);
alg = diff(start_alg, end_alg);
printf("%s,%s,%d,%d,%d,%ld,%d,%ld\n", problem_name, approach_name, n, p, e2e.tv_sec, e2e.tv_nsec, alg.tv_sec, alg.tv_nsec);
}
else
{
int offset;
int rows;
MPI_Recv(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive offset
MPI_Recv(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive number of rows
MPI_Recv(&matA[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // Receive portion of matrix A
int k;
// Do matrix multiplication
for(i = offset; i < offset + rows; i++) {
for(j = 0; j < n; j++) {
for(k = 0; k < n; k++) {
matC[i][j] += (matA[i][k] * matB[k][j]);
}
}
}
MPI_Send(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD); // Send offset
MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); // Send number of rows
MPI_Send(&matC[offset][0], rows * n, MPI_INT, 0, 3, MPI_COMM_WORLD); // Send portion of matrix C
}
for(i = 0; i < n; i++) {
free(matA[i]);
free(matB[i]);
free(matC[i]);
}
printf("End:%d\n", world_rank);
MPI_Finalize();
}
当我在具有4个节点且每个节点具有16个核心的群集上运行程序时,最初代码可以正常运行而没有任何错误。但是经过一些随机数的运行后,代码会引发分段错误,代码会再次运行而没有任何错误。甚至在MPI_Finalize()之前的printf语句被所有进程执行(当我得到seg错误时)并且所有输出行都被正确地计算和接收但我不明白为什么它不起作用。此外,在我的笔记本电脑上只有2个物理内核时,我运行n,p的代码,这给了我群集上的seg故障,代码运行完全正常,没有任何seg故障。
This is the error trace 对于低质量的图像感到抱歉,我没有任何其他方法来提取痕迹。
提前致谢。
编辑:预期输出:存储在matC中的两个矩阵matA和matB的简单矩阵乘法。 matA的所有条目都是2,matB的所有条目都是1.因此matc在所有条目中都应该有2n,其中nxn是matA,matB和matC的维度。
编辑:错误测试用例:对于跟随n(维度),p(核心数),代码给出了seg错误。我认为这是随机的,但要提出更清楚的问题
1. n = 2048 p = 12
2. n = 64 p = 16
3. n = 1024 p = 28
4. n = 2048 p = 16等等