#pragma omp parallel slows the code down instead of speeding it up - in Lattice Boltzmann

Asked: 2013-10-17 23:30:12

Tags: c parallel-processing openmp

I am new to OpenMP and I am trying to parallelize this loop:

int ii,jj,kk;                 /* generic counters */
const double c_sq = 1.0/3.0;  /* square of speed of sound */
const double w0 = 4.0/9.0;    /* weighting factor */
const double w1 = 1.0/9.0;    /* weighting factor */
const double w2 = 1.0/36.0;   /* weighting factor */
double u_x,u_y;               /* av. velocities in x and y directions */
double u[NSPEEDS];            /* directional velocities */
double d_equ[NSPEEDS];        /* equilibrium densities */
double u_sq;                  /* squared velocity */
double local_density;         /* sum of densities in a particular cell */

/* loop over the cells in the grid
** NB the collision step is called after
** the propagate step and so values of interest
** are in the scratch-space grid */
//#pragma omp parallel for private (ii, jj, kk, d_equ) shared (cells, tmp_cells)
for(ii=0;ii<params.ny;ii++) {
 for(jj=0;jj<params.nx;jj++) {
  /* don't consider occupied cells */
  if(!obstacles[ii*params.nx + jj]) {
    /* compute local density total */
    local_density = 0.0;
    for(kk=0;kk<NSPEEDS;kk++) {
      local_density += tmp_cells[ii*params.nx + jj].speeds[kk];
    }
    /* compute x velocity component */
    u_x = (tmp_cells[ii*params.nx + jj].speeds[1] +
           tmp_cells[ii*params.nx + jj].speeds[5] +
           tmp_cells[ii*params.nx + jj].speeds[8]
           - (tmp_cells[ii*params.nx + jj].speeds[3] +
              tmp_cells[ii*params.nx + jj].speeds[6] +
              tmp_cells[ii*params.nx + jj].speeds[7]))
      / local_density;
    u_y = (tmp_cells[ii*params.nx + jj].speeds[2] +
           tmp_cells[ii*params.nx + jj].speeds[5] +
           tmp_cells[ii*params.nx + jj].speeds[6]
           - (tmp_cells[ii*params.nx + jj].speeds[4] +
              tmp_cells[ii*params.nx + jj].speeds[7] +
              tmp_cells[ii*params.nx + jj].speeds[8]))
      / local_density;
    /* velocity squared */
    u_sq = u_x * u_x + u_y * u_y;
    /* directional velocity components */
    u[1] =   u_x;        /* east */
    u[2] =         u_y;  /* north */
    u[3] = - u_x;        /* west */
    u[4] =       - u_y;  /* south */
    u[5] =   u_x + u_y;  /* north-east */
    u[6] = - u_x + u_y;  /* north-west */
    u[7] = - u_x - u_y;  /* south-west */
    u[8] =   u_x - u_y;  /* south-east */
    /* equilibrium densities */
    /* zero velocity density: weight w0 */
    d_equ[0] = w0 * local_density * (1.0 - u_sq / (2.0 * c_sq));
    /* axis speeds: weight w1 */
    d_equ[1] = w1 * local_density * (1.0 + u[1] / c_sq
                                     + (u[1] * u[1]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[2] = w1 * local_density * (1.0 + u[2] / c_sq
                                     + (u[2] * u[2]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[3] = w1 * local_density * (1.0 + u[3] / c_sq
                                     + (u[3] * u[3]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[4] = w1 * local_density * (1.0 + u[4] / c_sq
                                     + (u[4] * u[4]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    /* diagonal speeds: weight w2 */
    d_equ[5] = w2 * local_density * (1.0 + u[5] / c_sq
                                     + (u[5] * u[5]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[6] = w2 * local_density * (1.0 + u[6] / c_sq
                                     + (u[6] * u[6]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[7] = w2 * local_density * (1.0 + u[7] / c_sq
                                     + (u[7] * u[7]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    d_equ[8] = w2 * local_density * (1.0 + u[8] / c_sq
                                     + (u[8] * u[8]) / (2.0 * c_sq * c_sq)
                                     - u_sq / (2.0 * c_sq));
    /* relaxation step */
    for(kk=0;kk<NSPEEDS;kk++) {
      cells[ii*params.nx + jj].speeds[kk] = (tmp_cells[ii*params.nx + jj].speeds[kk]
                                             + params.omega *
                                             (d_equ[kk] - tmp_cells[ii*params.nx + jj].speeds[kk]));
      }
    }
  }
}

params.nx = 300 and params.ny = 200 for a 300x200 d2q9 LB grid. The pragma statement I have commented out only makes the run time go up rather than down, and it also throws off the Reynolds number. I tried merging the two for loops to avoid possible false sharing, making the code look like this:

c=params.nx*params.ny;
#pragma omp for private (ii,jj,kk,d_equ) shared (cells, tmp_cells)
for(ii=0;ii<c;ii++) {
  /* don't consider occupied cells */
   if(obstacles[ii]) {
    /* called after propagate, so taking values from scratch space
    ** mirroring, and writing into main grid */
    cells[ii].speeds[1] = tmp_cells[ii].speeds[3];
    cells[ii].speeds[2] = tmp_cells[ii].speeds[4];
    ......
    ....
  }

This pragma still doesn't give me any speedup, although I do get the correct results with it. I have been working on this since last Saturday and I'm not getting anywhere, and I haven't found many helpful resources online. I would really appreciate some help.
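As a first sanity check for a missing speedup, it can help to confirm that OpenMP is actually enabled and running with more than one thread. Below is a minimal, self-contained sketch (not taken from the post above; the array and workload are made up) that prints the available thread count and times a trivial parallel loop with omp_get_wtime. Build it with OpenMP enabled, e.g. gcc -O2 -fopenmp check.c; without that flag the pragma is ignored and the omp_* runtime calls will not link.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

int main(void)
{
    const int n = 300 * 200;            /* same cell count as the 300x200 grid */
    double *a = malloc(n * sizeof *a);
    if (a == NULL) return 1;
    for (int i = 0; i < n; i++) a[i] = 1.0;

    printf("max threads available: %d\n", omp_get_max_threads());

    double t0 = omp_get_wtime();
    #pragma omp parallel for            /* loop index i is private automatically */
    for (int i = 0; i < n; i++) {
        a[i] = a[i] * 0.9 + 0.1;        /* stand-in for the per-cell work */
    }
    double t1 = omp_get_wtime();

    printf("parallel loop took %f seconds\n", t1 - t0);
    free(a);
    return 0;
}

If the reported thread count is 1 (for example because OMP_NUM_THREADS=1 is set), no speedup can be expected no matter how the pragma is written.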

1 Answer

Answer 0 (score: 0)

Here are my wild guesses:

• Your #pragma declaration may be missing the parallel keyword, which would prevent the loop from being parallelized at all (see the combined sketch after this list).
• Most of the variables declared at the start of the code are not marked private, so they are implicitly shared. That makes them prone to race conditions in the first case (but not in the second, since that version effectively runs sequentially). You should either declare them private or, better still, declare them inside the for loop, which makes them private automatically:
for (int i = 0; i < params.nx * params.ny; i++) {
    /* declared inside the loop body, so each thread gets its own copy */
    double u[NSPEEDS];
    double d_equ[NSPEEDS];
    ...
    int kk;
}
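Putting both points together, here is a rough sketch of what the loop could look like with the parallel keyword present and all scratch data declared inside the loop body. This is only an illustration: the t_cell struct, the relax() wrapper and the placeholder equilibrium values are stand-ins, not the code from the question.

#define NSPEEDS 9

typedef struct { double speeds[NSPEEDS]; } t_cell;

/* one flat sweep over all cells; each thread works on its own range of i,
   and everything declared inside the loop body is private to that thread */
void relax(int ncells, double omega,
           t_cell *cells, const t_cell *tmp_cells, const int *obstacles)
{
    #pragma omp parallel for default(none) shared(ncells, omega, cells, tmp_cells, obstacles)
    for (int i = 0; i < ncells; i++) {
        if (obstacles[i]) continue;        /* skip occupied cells */

        double local_density = 0.0;        /* private: declared in the loop */
        for (int kk = 0; kk < NSPEEDS; kk++)
            local_density += tmp_cells[i].speeds[kk];

        double d_equ[NSPEEDS];             /* private scratch array */
        for (int kk = 0; kk < NSPEEDS; kk++)
            d_equ[kk] = local_density / NSPEEDS;   /* placeholder equilibrium */

        for (int kk = 0; kk < NSPEEDS; kk++)
            cells[i].speeds[kk] = tmp_cells[i].speeds[kk]
                                + omega * (d_equ[kk] - tmp_cells[i].speeds[kk]);
    }
}

With this layout there is nothing left to put in a private clause, default(none) forces every remaining variable to be listed explicitly, and each iteration writes only to its own cells[i], so no synchronization is needed.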