Initializing data with OpenMP [shallow water algorithm]

Date: 2016-04-06 08:41:39

Tags: c parallel-processing openmp

First of all, my English is very bad, so sorry if this is badly written...

I am learning how to parallelize C code with OpenMP, and the algorithm I am trying to parallelize is the shallow water equations algorithm. Just by using a simple #pragma omp parallel for on the most critical loop I have obtained a performance improvement of roughly 40%, but I know my implementation is very poor and I am not squeezing the cores as much as I should. The structure of the code is simple: a 'main' that allocates memory and initializes some matrices and arrays, and then calls a function named solver that does all the work, which is where I put the #pragma omp parallel for.

I thought I could boost performance with a parallel section in which the memory is allocated and initialized, so that each thread has all of its data, but when I run the program I see no improvement. Since I am a newbie at this, I don't know whether the idea is bad or whether the bad thing is my implementation. I would appreciate some help or hints that could improve the performance of the algorithm. This is my homework and I don't want someone to do it for me, just a bit of help that lets me move forward...

I will paste the code for better understanding:

Main function (allocations and initialization)

int main(int argc, char **argv) {

  long int i, j, m, n, M, N;
  char *ptr;
  long int s;
  int flag, verbose;
  double *Q;
  double *x, *y;
  double **ffx, **nFx, **ffy, **nFy;
  double dx, dt, epsi, delta, dy, tend, tmp, stime;

 /* Default values to use: m volumes in the x-direction and n volumes in the y-direction */
  M = 1000;
  N = 1000;

  /* create file and verbose flags */
  .......
  .......

  /* Parse command line options */
  .......
  .......


  epsi = 2.0;
  delta = 0.5;
  dx = (xend - xstart) / (double) M;
  dy = (yend - ystart) / (double) N;
  dt = dx / sqrt( 9.81 * 5.0);
  tend = 0.1;

  /* Add two ghost volumes at each side of the domain */
  m = M + 2;
  n = N + 2;

  /* Allocate memory for the domain */

  /* HERE IS WHERE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS */
  #pragma omp parallel
  {
  Q = (double *) malloc(m * n * cell_size *  sizeof(double));

  x = (double *) malloc(m * sizeof(double));
  y = (double *) malloc(n * sizeof(double));    

  /* Allocate memory for fluxes */
  ffx = (double **) malloc(cell_size * sizeof(double *));
  ffy = (double **) malloc(cell_size * sizeof(double *));
  nFx = (double **) malloc(cell_size * sizeof(double *));
  nFy = (double **) malloc(cell_size * sizeof(double *));

  ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
  nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
  ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
  nFy[0] = (double *) malloc(cell_size * n * sizeof(double));

  for (i = 0; i < cell_size; i++) {
    ffx[i] =  ffx[0] + i * m;
    nFx[i] =  nFx[0] + i * m;
    ffy[i] =  ffy[0] + i * n;
    nFy[i] =  nFy[0] + i * n;
  }

  for (i = 0,tmp= -dx/2 + xstart; i < m; i++, tmp += dx)
    x[i] = tmp;

  for (i = 0,tmp= -dy/2 + ystart; i < n; i++, tmp += dy)
    y[i] = tmp;

  /* Set initial Gauss hump */
  for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
      Q(0, i, j) = 4.0;
      Q(1, i, j) = 0.0;
      Q(2, i, j) = 0.0;
    }
  }

  for (i = 1; i < m-1; i++) {
    for (j = 1; j < n-1; j++) {
      Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
                      (pow(delta, 2)));
    }
  }
  }

  //  Record start time
  stime = gettime();
  /*THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE*/
  solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);
}

SOLVER FUNCTION (the critical part)

/*
  This is the main solver routine. 
*/
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
        int m, int n, double tend, double dx, double dy, double dt) {
  double bc_mask[3] = {1.0, -1.0, -1.0};
  double time;
  int i, j, k, steps;

  steps = ceil(tend / dt);  
  for (i = 0, time = 0.0; i < steps; i++, time += dt) { 

    /* Apply boundary condition */ 
    #pragma omp parallel for private(j) num_threads (NTHR)
    for (k = 0; k < cell_size; k++)    
    {
      for (j = 1; j < n - 1 ; j++)  
      {
        Q(k, 0, j)   = bc_mask[k] * Q(k, 1, j);
        Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
      }
    }
    #pragma omp parallel for private(j) num_threads (NTHR)
    for (k = 0; k < cell_size; k++)  
    {
      for (j = 0; j < m; j++) 
      {
        Q(k, j, 0)   = bc_mask[k] * Q(k, j, 1);
        Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
      }
    }

    /* Update all volumes with the Lax-Friedrichs scheme */
    laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);  

  }
}

/*
  This is the Lax-Friedrichs scheme for updating volumes
*/
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
            int m, int n, double dx, double dy, double dt) {
  int i, j, k;

  /* Calculate and update fluxes in the x-direction */
  #pragma omp parallel for private(k,j) num_threads (NTHR)
  for (i = 1; i < n; i++) {
    fx(Q, ffx, m, n, i);
    for (k = 0; k < cell_size;  k++) 
      for (j = 1; j < m; j++)
        nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
    for (k = 0; k < cell_size; k++)
      for (j = 1; j < m-1; j++)
        Q(k, j, i) = Q(k, j, i) - dt/dx * (nFx[k][j+1] - nFx[k][j]);

  }

  /* Calculate and update fluxes in the y-direction */
  #pragma omp parallel for private(k,j) num_threads (NTHR)
  for (i = 1; i < m; i++) {
    fy(Q, ffy, m, n, i);
    for (k = 0; k < cell_size; k++)
      for (j = 1; j < n; j++)
        nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j-1)));
    for (k = 0; k < cell_size; k++)
      for (j = 1; j < n-1; j++)
        Q(k, i, j) = Q(k, i, j) - dt/dy * (nFy[k][j+1] - nFy[k][j]);
  }

}

As far as I can tell, the loops in the solver function and its subfunctions have no data dependencies, and since the parallel region I put around the allocations and data initialization did nothing, I don't know how to proceed.

Thanks in advance!

2 answers:

Answer 0 (score: 0)

There are multiple issues with your code. First, you have a data race, since you write to shared variables such as Q, x, and y from all threads. Either perform the allocations outside of the parallel region, or have them executed by only one thread (#pragma omp master or #pragma omp single).
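A minimal sketch of the single-thread variant (my illustration, not the answerer's own code; the allocation calls are the ones from the question):

#pragma omp parallel
{
  #pragma omp single
  {
    /* One thread performs the allocations; the implicit barrier at the
       end of 'single' keeps the other threads from using the buffers
       before they exist. */
    Q = (double *) malloc(m * n * cell_size * sizeof(double));
    x = (double *) malloc(m * sizeof(double));
    y = (double *) malloc(n * sizeof(double));
  }
  /* ... work-shared initialization loops go here ... */
}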

Second, the for loops in your initialization part are not actually work-shared: as written, every thread executes each full loop over the whole range (again a data race, and likely a lot of cache contention). You should add #pragma omp for to these loops (they already sit inside the parallel region). For the nested loops, the collapse clause may be useful.
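A minimal sketch of what that work-sharing could look like for the Gauss-hump initialization loop from the question, assuming it stays inside the enclosing parallel region:

/* Each thread initializes a distinct chunk of (i, j) pairs; the
   'collapse(2)' clause merges both loops into one iteration space,
   and both loop variables are implicitly privatized. */
#pragma omp for collapse(2)
for (i = 0; i < m; i++) {
  for (j = 0; j < n; j++) {
    Q(0, i, j) = 4.0;
    Q(1, i, j) = 0.0;
    Q(2, i, j) = 0.0;
  }
}

On a NUMA machine this also gives first-touch placement of the pages of Q across the threads, which is presumably the effect the questioner was hoping for.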

Also, make sure there are no data races in the solver() and laxf_scheme_2d() functions. It looks like most of the computation time is spent inside laxf_scheme_2d(), yet this function does not run in parallel at all. Does it use OpenMP internally?
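One detail worth spelling out here (my note, not part of the original answer): a #pragma omp for that is not enclosed in a parallel region, directly or through its caller, is an "orphaned" directive and executes sequentially. So if laxf_scheme_2d() only contains #pragma omp for, as in the version shown below in Answer 1, the call site must supply the threads, roughly like this (still unsafe as written, because ffx/nFx are shared scratch arrays; see the sketch at the end of Answer 1):

/* Hypothetical call site: the parallel region lives in the caller, and
   the orphaned '#pragma omp for' loops inside laxf_scheme_2d() then
   share their iterations among these threads. */
#pragma omp parallel
{
  laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
}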

Answer 1 (score: 0)

Thanks for your answer. I have had a lot of problems with my implementation. First of all, the most important function, where all the work is done, is laxf_scheme_2d. Regarding the Q variable, I have #define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]
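Spelled out (my reading of that macro, not stated in the original post): Q is one flat array of m * n * cell_size doubles in which the first macro argument varies slowest and the last varies fastest. So the y-sweep accesses Q(k, i, j) with consecutive j are unit-stride, while the x-sweep accesses Q(k, j, i) with consecutive j jump n doubles at a time:

/* Layout implied by the macro: Q(i, j, k) sits at flat offset
   k + n*j + n*m*i, so Q(c, r, 0) and Q(c, r, 1) are adjacent in
   memory, while Q(c, r, 0) and Q(c, r+1, 0) are n doubles apart. */
#define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]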

This is laxf_scheme_2d:

void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
            int m, int n, double dx, double dy, double dt) {
  int i, j, k;

  /* Calculate and update fluxes in the x-direction */
  #pragma omp for
  for (i = 1; i < n; i++) {
    fx(Q, ffx, m, n, i);
    for (j = 1; j < m; j++) 
      for (k = 0; k < cell_size;  k++) 
        nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) -
                           dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
    for (j = 1; j < m-1; j++)
      for (k = 0; k < cell_size; k++)
        Q(k, j, i) = Q(k, j, i) - dt/dx * (nFx[k][j+1] - nFx[k][j]);

  }

  /* Calculate and update fluxes in the y-direction */
  #pragma omp for
  for (i = 1; i < m; i++) {
    fy(Q, ffy, m, n, i);
    for (j = 1; j < n; j++)
      for (k = 0; k < cell_size; k++)
        nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) -
                           dy/dt * (Q(k, i, j) - Q(k, i, j-1)));
    for (j = 1; j < n-1; j++)
      for (k = 0; k < cell_size; k++)
        Q(k, i, j) = Q(k, i, j) - dt/dy * (nFy[k][j+1] - nFy[k][j]);
  }

}

The functions fx and fy are very simple and have no data dependencies. I could not put a #pragma omp parallel for above the first for loop because there are data races, and right now I cannot see how to change this code to overcome them. For reference, this is how the data is allocated and initialized now:
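One way out (a sketch of my own, under the assumption that the races come from ffx, ffy, nFx and nFy being shared scratch buffers that every iteration of the i loop overwrites, and that fx() reads only column i of Q, as its use here suggests): give each thread its own scratch arrays, for instance by allocating them inside the parallel region. The columns updated by different values of i are then independent and the loop can be work-shared safely. For the x-sweep:

/* Sketch: per-thread flux scratch buffers remove the race on the shared
   ffx/nFx arrays. alloc_scratch()/free_scratch() are hypothetical
   helpers that build/release the same cell_size x m two-level layout
   used in main(). */
#pragma omp parallel private(i, j, k)
{
  double **ffx_p = alloc_scratch(cell_size, m);  /* thread-private */
  double **nFx_p = alloc_scratch(cell_size, m);

  #pragma omp for
  for (i = 1; i < n; i++) {
    fx(Q, ffx_p, m, n, i);
    for (j = 1; j < m; j++)
      for (k = 0; k < cell_size; k++)
        nFx_p[k][j] = 0.5 * ((ffx_p[k][j-1] + ffx_p[k][j]) -
                             dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
    for (j = 1; j < m-1; j++)
      for (k = 0; k < cell_size; k++)
        Q(k, j, i) = Q(k, j, i) - dt/dx * (nFx_p[k][j+1] - nFx_p[k][j]);
  }

  free_scratch(ffx_p);
  free_scratch(nFx_p);
}

The y-sweep can be handled the same way with per-thread copies of ffy and nFy.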

  long int i, j, m, n, M, N;
  char *ptr;
  long int s;
  int flag, verbose;
  double *Q;
  double *x, *y;
  double **ffx, **nFx, **ffy, **nFy;
  double dx, dt, epsi, delta, dy, tend, tmp, stime;
  M = 1000;
  N = 1000;

  /* Add two ghost volumes at each side of the domain */
  m = M + 2;
  n = N + 2;

  /* Allocate memory for the domain */
  Q = (double *) malloc(m * n * cell_size *  sizeof(double));

  x = (double *) malloc(m * sizeof(double));
  y = (double *) malloc(n * sizeof(double));    

  /* Allocate memory for fluxes */
  ffx = (double **) malloc(cell_size * sizeof(double *));
  ffy = (double **) malloc(cell_size * sizeof(double *));
  nFx = (double **) malloc(cell_size * sizeof(double *));
  nFy = (double **) malloc(cell_size * sizeof(double *));

  ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
  nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
  ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
  nFy[0] = (double *) malloc(cell_size * n * sizeof(double));