当我使用命令#34运行我的代码时; mpirun -hosts o251-12,o251-13 ./matrixmult"我键入ijk R 4,它给出了错误。更糟糕的是,如果我再次运行,错误有时会改变。 如果我使用1或2而不是4,它运行良好,但如果我使用大数字,它会给出错误。我试过改变我的代码,但是没有用。
这是我的代码
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <string.h>
void putValue(int *A, int *B, char *flag, int n);
void ijk(int *A, int *B, int *result, int n, int tmpAn);
void ikj(int *A, int *B, int *result, int n, int tmpAn);
void kij(int *A, int *B, int *result, int n, int tmpAn);
int main(){
int *A;
int *B;
int *result;
int n;
char flag[2];
char form[4];
int my_rank;
int comm_sz;
double time1;
double time2;
double time;
int i;
int j;
int *tmpA;
int *tmpResult;
int *sc;
int *displs;
int alpha;
int d;
MPI_Init(NULL,NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD,&comm_sz);
MPI_Barrier(MPI_COMM_WORLD);
if(my_rank==0){
time1=MPI_Wtime();
}
if(my_rank==0){
scanf("%s", form);
scanf("%s", flag);
scanf("%d", &n);
A = (int*)malloc(n * n * sizeof(int));
B = (int*)malloc(n * n * sizeof(int));
result = (int*)malloc(n * n * sizeof(int));
putValue(A,B,flag,n);
printf("running on %d processors\n", comm_sz);
}
if(my_rank){
B = (int*)malloc(n * n * sizeof(int));
}
MPI_Bcast(&n, 1, MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(B,n*n,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&form, 4, MPI_CHAR,0,MPI_COMM_WORLD);
sc = (int*)malloc(comm_sz * sizeof(int));
displs = (int*)malloc(comm_sz * sizeof(int));
alpha = n%comm_sz;
d=0;
for(i=0;i<comm_sz;i++){
sc[i] = n/comm_sz;
if(i<alpha){
sc[i] = sc[i]+1;
}
sc[i]=sc[i]*n;
displs[i] = d;
d+=sc[i];
}
tmpA = (int*)malloc(n * sc[my_rank] * sizeof(int));
tmpResult = (int*)malloc(n * sc[my_rank] * sizeof(int));
for(i=0; i<sc[my_rank]; i++){
tmpResult[i]=0;
}
MPI_Scatterv(A,sc,displs,MPI_INT,tmpA,sc[my_rank],MPI_INT,0,MPI_COMM_WORLD);
if(strcmp(form,"ijk")==0){
ijk(tmpA,B,tmpResult,n,sc[my_rank]);
}
else if(strcmp(form,"ikj")==0){
ikj(tmpA,B,tmpResult,n,sc[my_rank]);
}
else if(strcmp(form,"kij")==0){
kij(tmpA,B,tmpResult,n,sc[my_rank]);
}
MPI_Gatherv(tmpResult,sc[my_rank],MPI_INT,result,sc,displs,MPI_INT,0,MPI_COMM_WORLD);
if(my_rank==0){
time2=MPI_Wtime();
time=time2-time1;
printf("elapsed time = %.6e secondes\n",time);
if(strcmp(flag,"I")==0){
for(i=0;i<n;i++){
for(j=0;j<n;j++){
printf("%d ",result[i*n+j]);
}
printf("\n");
}
}
}
MPI_Finalize();
return 0;
}
void putValue(int *A, int *B, char *flag, int n){
int i,j;
srand((unsigned)time(NULL));
if(strcmp(flag,"R")==0){
for(i=0; i<n; i++){
for(j=0; j<n; j++){
A[i*n+j] = (int)rand()%101;
B[i*n+j] = (int)rand()%101;
printf("%d\n",A[i*n+j]);
printf("%d\n",B[i*n+j]);
}
}
}
else if(strcmp(flag,"I")==0){
for(i=0; i<n; i++){
for(j=0; j<n; j++){
int x;
scanf("%d", &x);
A[i*n+j]=x;
}
}
for(i=0; i<n; i++){
for(j=0; j<n; j++){
int x;
scanf("%d", &x);
B[i*n+j]=x;
}
}
}
}
void ijk(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(i=0;i<(tmpAn/n);i++){
for(j=0;j<n;j++){
for(k=0;k<n;k++){
result[i*n+j] += (A[i*n+k] * B[k*n+j]);
}
}
}
}
void ikj(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(i=0;i<(tmpAn/n);i++){
for(k=0;k<n;k++){
for(j=0;j<n;j++){
result[i*n+j] += (A[i*n+k] * B[k*n+j]);
}
}
}
}
void kij(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(k=0;k<n;k++){
for(i=0;i<(tmpAn/n);i++){
for(j=0;j<n;j++){
result[i*n+j] += (A[i*n+k] * B[k*n+j]);
}
}
}
}
有时错误是
rank = 3, revents = 25, state = 8
Assertion failed in file ../../src/mpid/ch3/channels/nemesis/netmod/tcp/socksm.c at line 2988: (it_plfd->revents & POLLERR) == 0
internal ABORT - process 1
有时错误是
Fatal error in PMPI_Gatherv: Unknown error class, error stack:
PMPI_Gatherv(1001)....................: MPI_Gatherv failed(sbuf=0x231f140,
scount=4, MPI_INT, rbuf=0x231f060, rcnts=0x231f0b0, displs=0x231f0d0, MPI_INT, root=0, MPI_COMM_WORLD) failed
MPIR_Gatherv_impl(545)................: fail failed
I_MPIR_Gatherv_intra(617).............: Failure during collective
I_MPIR_Gatherv_intra(590).............: fail failed
MPIR_Gatherv_advanced(720)............: fail failed
MPIDU_Complete_posted_with_error(1710): Process failed
====================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= PID 17870 RUNNING AT o251-13
= EXIT CODE: 134
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
答案 0 :(得分:0)
此行由根进程
执行scanf("%d", &n);
此行由广播n
之前的其余部分执行:
B = (int*)malloc(n * n * sizeof(int));
在广播之后移动那个alloc语句。