I am trying to use OpenMP work-sharing constructs. The code shared here is a simpler example of what is going wrong in my larger OpenMP code. I assign values to the elements of an integer matrix, check the element values, reset them to 0, and repeat this inside a 't' loop. In the integer 'p' I count the number of times the value assignment (done in parallel) fails. If the code were correct, p should be 0, but it gives different answers on different runs, so the work-sharing is failing somewhere. I have to run it about 12 times before I get the first wrong value of p (1, 2, 3, etc.) as output.
The barrier directives in the code are not strictly necessary; I was getting wrong values of p without them and thought explicit barriers would help, but I was wrong. Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define NRA 10 /* number of rows in matrix A */
#define NCA 10 /* number of columns in matrix A */

int main()
{
    int i, j, ir, p = 0, t;
    int *a;
    a = (int*) malloc(sizeof(int)*NRA*NCA);
    omp_set_num_threads(5);
    for(t=0;t<100000;t++)
    {
        #pragma omp barrier
        #pragma omp parallel for schedule (static,2) collapse(2)
        for(i=0;i<NRA;i++)
        {
            for(j=0;j<NCA;j++)
            {
                ir=j*NRA+i;
                a[ir] = 1;
            }
        }
        #pragma omp single
        {
            for(i=0;i<NRA;i++)
            {
                for(j=0;j<NCA;j++)
                {
                    ir=j*NRA+i;
                    if(a[ir] != 1)
                    {
                        p += 1;
                    }
                }
            }
        }
        #pragma omp parallel for schedule (static,2) collapse(2)
        for(i=0;i<NRA;i++)
        {
            for(j=0;j<NCA;j++)
            {
                ir=j*NRA+i;
                a[ir] = 0;
            }
        }
        # pragma omp barrier
    }//end t
    printf("p is %d\n",p);
}
Here is the larger code. I don't think a race condition is the problem, because I declare all the variables outside the parallel loop as shared, and all the other variables inside the parallel loop. Any suggestions would be helpful!
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define NRA 10 /* number of rows in matrix A */
#define NCA 10 /* number of columns in matrix A */
#define NCB 10 /* number of columns in matrix B */

void matrixcalc (double *ad, double *bd, double *cd, int chunkd);
void printresults (double *cd, int chunkd);
void printrep (double *cd, int chunkd);

int main ()
{
    int nthreads, chunk, p = 0;
    double *a,*b,*c;
    a = (double*)malloc(NRA*NCA*sizeof(double));
    if(a==NULL)
        printf("ho\n");
    b = (double*)malloc(NCA*NCB*sizeof(double));
    c = (double*)malloc(NRA*NCB*sizeof(double));
    omp_set_num_threads(5);
    chunk = 2; /* set loop iteration chunk size */
    int ir3, i1, j1;
    /*** Spawn a parallel region explicitly scoping all variables ***/
    int t, tmax = 100000;
    for(t=0;t<tmax;t++)
    {
        #pragma omp parallel shared(a,b,c,nthreads,chunk,t,tmax)
        {
            int tid = omp_get_thread_num();
            int i, j, ir;
            if (tid == 0)
            {
                nthreads = omp_get_num_threads();
                // printf("Starting matrix multiple example with %d threads\n",nthreads);
                // printf("Initializing matrices...\n");
            }
            /*** Initialize matrices ***/
            #pragma omp for schedule (static, chunk) collapse(2)
            for (i=0; i<NRA; i++)
            {
                for (j=0; j<NCA; j++)
                {
                    ir =j*NRA+i;
                    a[ir]= 1.0;
                }
            }
            #pragma omp for schedule (static, chunk) collapse(2)
            for (i=0; i<NCA; i++)
            {
                for (j=0; j<NCB; j++)
                {
                    ir = j*NCA+i;
                    b[ir] = 1.0;
                }
            }
            #pragma omp for schedule (static, chunk) collapse(2)
            for (i=0; i<NRA; i++)
            {
                for (j=0; j<NCB; j++)
                {
                    ir=j*NRA+i;
                    c[ir]= 0.0;
                }
            }
            /*** Do matrix multiply sharing iterations on outer loop ***/
            /*** Display who does which iterations for demonstration purposes ***/
            matrixcalc(a,b,c,chunk);
            if(t!=tmax-1)
            {
                #pragma omp for schedule (static, chunk) collapse(2)
                for(i=0;i<NRA;i++)
                {
                    for(j=0;j<NCB;j++)
                    {
                        ir=j*NRA+i;
                        c[ir]=0.0;
                    }
                }
            }
        }//end parallel region
        for(i1=0;i1<NRA;i1++)
        {
            for(j1=0;j1<NCB;j1++)
            {
                ir3=j1*NRA+i1;
                if(c[ir3]!=12.20000&&c[ir3]!=0.0)
                {
                    printf("%lf\n",c[ir3]);
                    p+=1;
                }
            }
        }
    }//end t
    printf("finalp\t%d\n",p);
    for(i1=0;i1<NRA;i1++)
    {
        for(j1=0;j1<NCB;j1++)
        {
            ir3=j1*NRA+i1;
            printf("%lf\t",c[ir3]);
        }
        printf("\n");
    }
}

void matrixcalc (double *a, double *b, double *c, int chunk)
{
    int i,j,k,ir,ir1,ir2;
    //printf("Thread %d starting matrix multiply...%d\n",tid,chunk);
    double r = 1.0;
    #pragma omp for schedule (static, chunk) collapse(3)
    for (i=0; i<NRA; i++)
    {
        for(j=0; j<NCB; j++)
        {
            for (k=0; k<NCA; k++)
            {
                ir=j*NRA+i;
                ir1=k*NRA+i;
                ir2=j*NCA+k;
                c[ir] += a[ir1] * b[ir2];
            }
        }
    }
    #pragma omp for schedule (static, chunk) collapse(2)
    for(i=0;i<NRA;i++)
    {
        for(j=0;j<NCB;j++)
        {
            ir=j*NRA+i;
            c[ir]+=r*2.0;
        }
    }
    #pragma omp single
    {
        double h;
        h = 0.1;
        h = 2.0*h;
        for(i=0;i<NRA;i++)
        {
            for(j=0;j<NCB;j++)
            {
                ir=j*NRA+i;
                c[ir]+=h;
            }
        }
    }
}
Answer (score: 2)
The problem is a race condition on ir. Since it is defined outside the loop, it is implicitly shared. You could force it to be private, but it is better to declare variables as locally as possible whenever you can. That makes reasoning about OpenMP code much easier:
#pragma omp parallel for schedule (static,2) collapse(2)
for(int i=0;i<NRA;i++)
{
    for(int j=0;j<NCA;j++)
    {
        int ir = j*NRA+i;
        a[ir] = 1;
    }
}
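For completeness, here is a minimal sketch of the private-clause alternative mentioned above, assuming the rest of the question's code stays unchanged (the collapsed loop variables i and j are already predetermined private by the parallel for, so only ir needs the clause):

int i, j, ir;
#pragma omp parallel for private(ir) schedule (static,2) collapse(2)
for(i=0;i<NRA;i++)
{
    for(j=0;j<NCA;j++)
    {
        /* each thread now works on its own copy of ir, so there is no race */
        ir = j*NRA+i;
        a[ir] = 1;
    }
}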
As Jorge Bellón commented, there are other issues in your code concerning redundant barriers and efficiency.
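To illustrate the barrier point on the smaller example: the explicit #pragma omp barrier directives there sit outside any parallel region, where they have no effect, and each parallel for already ends with an implicit barrier unless nowait is specified. One possible sketch of the body of the t loop with the race fixed as above and the redundant barriers removed:

for(t=0;t<100000;t++)
{
    /* fill the matrix in parallel; the implicit barrier at the end of the
       parallel for guarantees all writes finish before the check below */
    #pragma omp parallel for schedule (static,2) collapse(2)
    for(int i=0;i<NRA;i++)
        for(int j=0;j<NCA;j++)
            a[j*NRA+i] = 1;

    /* serial check: count assignments that did not happen */
    for(int i=0;i<NRA;i++)
        for(int j=0;j<NCA;j++)
            if(a[j*NRA+i] != 1)
                p += 1;

    /* reset in parallel for the next iteration */
    #pragma omp parallel for schedule (static,2) collapse(2)
    for(int i=0;i<NRA;i++)
        for(int j=0;j<NCA;j++)
            a[j*NRA+i] = 0;
}

As for efficiency, creating a parallel region 100000 times for a 10x10 matrix likely costs more than the parallel work saves, which may be part of the concern the comment refers to.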