如何并行嵌套循环

时间:2014-04-26 16:47:39

标签: c++ loops nested openmp

这是用于解决大维度计算数学问题的c ++代码的一部分,比如超过100000个变量。我想使用OpenMP并行化它。通过OpenMP并行以下嵌套循环的最佳方法是什么?

e = 0;
// m and n are are big numbers 200000 - 10000000
int i,k,r,s,t;
// hpk,hqk,pk_x0,n2pk_x0,dk,sk are double and declared before.
for (k=0; k<m; k++) 
{
  hpk     = 0;     
  hqk     = 0;
  n2pk_x0 = 0;
  dk      = 0;
  sk      = 0; 

  for (int i=0; i<n; i++) 
  {
     if (lamb[i] <= lam[k]) 
     {
          if (h[i]<0)
          {
             pk[i] = xu[i];
          }
          else if (h[i]>0)
          {
             pk[i] = xl[i];
          }
          qk[i] = 0;
     }
     else
     {
          pk[i] = x0[i];
          qk[i] = -h[i];
     }

     hpk     += h[i]*pk[i];
     hqk     += h[i]*qk[i];
     pk_x0    = pk[i]-x0[i];
     n2pk_x0 += pk_x0*pk_x0;
     dk      += pk_x0*qk[i];
     sk      += qk[i]*qk[i];
  }
  //}//p

  /* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
  ak = - (gamma + hpk);
  bk = - hqk;
  ck = q0 + 0.5 * n2pk_x0;
  sk = 0.5 * sk;
  // some calculation based on index k
} // end of first for

我做了一些建议来私有嵌套循环中的局部变量.CPU时间减少了因子1/2,但输出不正确!有没有办法以这样的方式改进代码,以更少的CPU时间获得正确的结果? (在嵌套循环中,如果我们设置m = 1,输出将是正确的,但是对于m> 1,输出是不正确的。)

这是整个代码:

static void subboconcpp(
               double u[],
               double *Egh,
                   double h[],
               double gamma,
                   double x0[],
               double q0,
                   double xl[],
               double xu[],
                   int    dim
               )
{

int    n,m,infinity = INT_MAX,i,k,r,s,t;;
double e; 
double hpk, hqk, dk1, sk1, n2pk_x0;
double ak, bk, ck, dk, sk; 
double lam_hat, phik, ek1, ek2;
double *pk    = new double[dim];
double *qk    = new double[dim]; 
double *lamb  = new double[dim];
double *lamb1 = new double[dim];
double *lam   = new double[dim];

/* ------------------ Computing lambl(i) and lambu(i) ------------------ */
/* n is the length of x0 */   
n = dim;  

#pragma omp parallel for shared(n,h,x0,xl,xu)//num_threads(8)
for (int i=0; i<n; i++) 
{
  double lamb_flag;
  if (h[i] > 0) 
  {
            lamb_flag = (x0[i] - xl[i])/h[i];
            lamb[i]  = lamb_flag;
            lamb1[i] = lamb_flag;
  } 
  else if (h[i] < 0) 
  {
            lamb_flag = (x0[i] - xu[i])/h[i];
            lamb[i]  = lamb_flag;
            lamb1[i] = lamb_flag;
  } 
  //cout << "lamb:" << lamb[i];
}
/* --------------------------------------------------------------------- */

/* ----------------- Sorting lamb and constructing lam ----------------- */

/* lamb = sort(lamb,1); */
sort(lamb1, lamb1+n);

int    q        = 0;
double lam_flag = 0;
#pragma omp parallel for shared(n) firstprivate(q) lastprivate(m)
for (int j=0; j<n; j++) 
{
  if (lamb1[j] > lam_flag) 
  {
     lam_flag = lamb1[j];
     q      = q + 1;
     lam[q] = lam_flag;
     //cout << "lam: \n" << lam[q];
  }

  if (j == n-1)
  {
     if (lam_flag < infinity)
     {
        m = q+1;
        lam[m] = + infinity;
     }
     else
     {
         m = q;
     }
  }
  //cout << "q: \n" << q;
}

/* --------------------------------------------------------------------- */

/* -- Finding the global maximizer of e(lam)  for lam in[-inf, + inf] -- */
e = 0;  

#pragma omp parallel shared(m,n,h,x0,xl,xu,lamb,lam) \
private(i,r,s,t,hpk, hqk, dk1, sk1, n2pk_x0,ak, bk, ck, dk, sk,lam_hat, phik, ek1, ek2) 
{
#pragma omp for nowait

for (k=0; k<1; k++) 
{
  /*double hpk=0, hqk=0, dk1=0, sk1=0, n2pk_x0=0;
  double ak, bk, ck, dk, sk; 
  double lam_hat, phik, ek1, ek2; 
  double *pk = new double[dim];
  double *qk = new double[dim];*/    

  hpk     = 0;     
  hqk     = 0;
  n2pk_x0 = 0;
  dk1     = 0;
  sk1     = 0; 

  for (int i=0; i<n; i++) 
  {
     double pk_x0;
     if (lamb[i] <= lam[k]) 
     {
          if (h[i]<0)
          {
             pk[i] = xu[i];
          }
          else if (h[i]>0)
          {
             pk[i] = xl[i];
          }
          qk[i] = 0;
     }
     else
     {
          pk[i] = x0[i];
          qk[i] = -h[i];
     }

     hpk     += h[i]*pk[i];
     hqk     += h[i]*qk[i];
     pk_x0    = pk[i]-x0[i];
     n2pk_x0 += pk_x0*pk_x0;
     dk1     += pk_x0*qk[i];
     sk1     += qk[i]*qk[i];
  }

  /* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
  ak = - (gamma + hpk);
  bk = - hqk;
  ck = q0 + 0.5 * n2pk_x0;
  dk = dk1;
  sk = 0.5 * sk1;
  /* ----------------------------------------------------------------- */ 

  /* - Finding the global maximizer of e(lam) for [lam(k), lam(k+1)] - */
  /* --------------------- using Proposition 4 ----------------------- */
  if (bk != 0) 
  {
         double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
         if (w == 0) 
         {
                lam_hat = -ak / bk;
                phik    = 0;
         } 
         else 
         {
                double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
                lam_hat = (-ak + sqrt(w))/bk;
                phik    = bk / (2*sk*lam_hat + dk);  
         }
  } 
  else 
  {
         if (ak > 0) 
         {
                   lam_hat = -dk / (2 * sk);
                   phik    = 4*ak*sk / (4*ck*sk + (sk - 2)*(dk*dk));
         } 
         else 
         {
                   lam_hat = + infinity; 
                   phik    = 0;
         }
  }
  /* ----------------------------------------------------------------- */

  /* --- Checking the feasibility of the solution of Proposition 4 --- */
  if (lam[k] <= lam_hat && lam_hat <= lam[k + 1]) 
  {
         if (phik > e) 
         {
            for (r=0; r<n; r++)
            {
               u[r] = pk[r] + lam_hat * qk[r];
            }

            e = phik;
         }
  } 
  else 
  {
         ek1 = (ak+bk*lam[k])/(ck+(dk+sk*lam[k])*lam[k]);
         ek2 = (ak+bk*lam[k+1])/(ck+(dk+sk*lam[k+1])*lam[k+1]);      
         if (ek1 >= ek2) 
         {
                lam_hat = lam[k];
                if (ek1 > e) 
                {
                   for (s=0; s<n;s++)
                   {
                      u[s] = pk[s] + lam_hat * qk[s];
                   }

                   e = ek1;
                }
         } 
         else 
         { 
                lam_hat = lam[k + 1];
                if (ek2 > e) 
                {
                   for (t=0; t<n;t++)
                   {
                      u[t] = pk[t] + lam_hat * qk[t];
                   } 

                   e = ek2;
                }
         }
  }
  /* ------------------------------------------------------------------ */

}/* ------------------------- End of for (k) --------------------------- */
}//p
/* --------- The global maximizer by searching all m intervals --------- */
*Egh = e;
delete[] pk;
delete[] qk;
delete[] lamb1;
delete[] lamb;
delete[] lam; 

return;
/* --------------------------------------------------------------------- */

}

请注意前两个并行代码运行良好,但只是嵌套循环的输出是正确的。

感谢任何建议或评论。

2 个答案:

答案 0 :(得分:1)

最外层循环:我不知道所有代码,但它看起来像变量hpk,hqk,n2pk_x0,dk,sk应该是私有的。如果你没有将它们指定为私有,它将破坏正确性。

OpenMP并不总是非常适合嵌套并行性。它取决于OpenMP设置,但嵌套循环可以创建p * p线程,其中p是机器的默认并发。如此大规模的超额订购可能会导致严重的性能下降。在大多数情况下,将最外层循环并行化并使嵌套循环保持连续是好的。

嵌套循环并行化的原因之一是实现更好的工作平衡。但是你的情况似乎有平衡的工作,如果你只对最外层的循环进行并行化,你不应该面对工作平衡问题。

但是,如果你仍想要并行化两个循环,我建议使用英特尔TBB而不是OpenMP?您可以将tbb :: parallel_for用于最外层循环,将tbb :: parallel_reduce用于嵌套循环。英特尔TBB为其所有算法使用一个线程池,因此不会导致您的应用程序超额订阅。

[已更新]某些并行化建议:

  1. 在您达到正确性之前,执行时间并不意味着什么。由于正确性修正可以显着改变它(即使在某些情况下更好);
  2. 不要尝试并行化#34;所有并且立即&#34;:尝试逐个循环并行化。当正确性被打破时,将更容易理解;
  3. 不要同时修改共享变量。如果你真的需要它,你应该重新考虑算法并使用特殊结构,如缩减,原子操作,锁/互斥量/信号量等。
  4. 在使用私有修改索引的共享数组中写入时要准确,因为不同的线程可能具有相同的索引。

答案 1 :(得分:0)

我认为您对嵌套并行化的想法并不适合OpenMP思维模式。尽管嵌套并行性可以在OpenMP中实现,但它带来了比必要更多的复杂性。通常在OpenMP中,您只能同时并行一个循环。

应该在具有最小交错依赖性的级别上进行并行化。这通常是最高级别的。在您的特定情况下,这也是正确的,因为外部循环中的步骤不是强耦合的。

我不知道代码的其余部分是做什么的,尤其是hpkhqkn2pk_x0dk和{的价值所发生的变化{1}}。您所要做的就是在代码中添加sk