这是用于解决大维度计算数学问题的c ++代码的一部分,比如超过100000个变量。我想使用OpenMP并行化它。通过OpenMP并行以下嵌套循环的最佳方法是什么?
e = 0;
// m and n are are big numbers 200000 - 10000000
int i,k,r,s,t;
// hpk,hqk,pk_x0,n2pk_x0,dk,sk are double and declared before.
for (k=0; k<m; k++)
{
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk = 0;
sk = 0;
for (int i=0; i<n; i++)
{
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk += pk_x0*qk[i];
sk += qk[i]*qk[i];
}
//}//p
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
sk = 0.5 * sk;
// some calculation based on index k
} // end of first for
我做了一些建议来私有嵌套循环中的局部变量.CPU时间减少了因子1/2,但输出不正确!有没有办法以这样的方式改进代码,以更少的CPU时间获得正确的结果? (在嵌套循环中,如果我们设置m = 1,输出将是正确的,但是对于m> 1,输出是不正确的。)
这是整个代码:
static void subboconcpp(
double u[],
double *Egh,
double h[],
double gamma,
double x0[],
double q0,
double xl[],
double xu[],
int dim
)
{
int n,m,infinity = INT_MAX,i,k,r,s,t;;
double e;
double hpk, hqk, dk1, sk1, n2pk_x0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];
double *lamb = new double[dim];
double *lamb1 = new double[dim];
double *lam = new double[dim];
/* ------------------ Computing lambl(i) and lambu(i) ------------------ */
/* n is the length of x0 */
n = dim;
#pragma omp parallel for shared(n,h,x0,xl,xu)//num_threads(8)
for (int i=0; i<n; i++)
{
double lamb_flag;
if (h[i] > 0)
{
lamb_flag = (x0[i] - xl[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
else if (h[i] < 0)
{
lamb_flag = (x0[i] - xu[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
//cout << "lamb:" << lamb[i];
}
/* --------------------------------------------------------------------- */
/* ----------------- Sorting lamb and constructing lam ----------------- */
/* lamb = sort(lamb,1); */
sort(lamb1, lamb1+n);
int q = 0;
double lam_flag = 0;
#pragma omp parallel for shared(n) firstprivate(q) lastprivate(m)
for (int j=0; j<n; j++)
{
if (lamb1[j] > lam_flag)
{
lam_flag = lamb1[j];
q = q + 1;
lam[q] = lam_flag;
//cout << "lam: \n" << lam[q];
}
if (j == n-1)
{
if (lam_flag < infinity)
{
m = q+1;
lam[m] = + infinity;
}
else
{
m = q;
}
}
//cout << "q: \n" << q;
}
/* --------------------------------------------------------------------- */
/* -- Finding the global maximizer of e(lam) for lam in[-inf, + inf] -- */
e = 0;
#pragma omp parallel shared(m,n,h,x0,xl,xu,lamb,lam) \
private(i,r,s,t,hpk, hqk, dk1, sk1, n2pk_x0,ak, bk, ck, dk, sk,lam_hat, phik, ek1, ek2)
{
#pragma omp for nowait
for (k=0; k<1; k++)
{
/*double hpk=0, hqk=0, dk1=0, sk1=0, n2pk_x0=0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];*/
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk1 = 0;
sk1 = 0;
for (int i=0; i<n; i++)
{
double pk_x0;
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk1 += pk_x0*qk[i];
sk1 += qk[i]*qk[i];
}
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
dk = dk1;
sk = 0.5 * sk1;
/* ----------------------------------------------------------------- */
/* - Finding the global maximizer of e(lam) for [lam(k), lam(k+1)] - */
/* --------------------- using Proposition 4 ----------------------- */
if (bk != 0)
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
if (w == 0)
{
lam_hat = -ak / bk;
phik = 0;
}
else
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
lam_hat = (-ak + sqrt(w))/bk;
phik = bk / (2*sk*lam_hat + dk);
}
}
else
{
if (ak > 0)
{
lam_hat = -dk / (2 * sk);
phik = 4*ak*sk / (4*ck*sk + (sk - 2)*(dk*dk));
}
else
{
lam_hat = + infinity;
phik = 0;
}
}
/* ----------------------------------------------------------------- */
/* --- Checking the feasibility of the solution of Proposition 4 --- */
if (lam[k] <= lam_hat && lam_hat <= lam[k + 1])
{
if (phik > e)
{
for (r=0; r<n; r++)
{
u[r] = pk[r] + lam_hat * qk[r];
}
e = phik;
}
}
else
{
ek1 = (ak+bk*lam[k])/(ck+(dk+sk*lam[k])*lam[k]);
ek2 = (ak+bk*lam[k+1])/(ck+(dk+sk*lam[k+1])*lam[k+1]);
if (ek1 >= ek2)
{
lam_hat = lam[k];
if (ek1 > e)
{
for (s=0; s<n;s++)
{
u[s] = pk[s] + lam_hat * qk[s];
}
e = ek1;
}
}
else
{
lam_hat = lam[k + 1];
if (ek2 > e)
{
for (t=0; t<n;t++)
{
u[t] = pk[t] + lam_hat * qk[t];
}
e = ek2;
}
}
}
/* ------------------------------------------------------------------ */
}/* ------------------------- End of for (k) --------------------------- */
}//p
/* --------- The global maximizer by searching all m intervals --------- */
*Egh = e;
delete[] pk;
delete[] qk;
delete[] lamb1;
delete[] lamb;
delete[] lam;
return;
/* --------------------------------------------------------------------- */
}
请注意前两个并行代码运行良好,但只是嵌套循环的输出是正确的。
感谢任何建议或评论。
答案 0 :(得分:1)
最外层循环:我不知道所有代码,但它看起来像变量hpk,hqk,n2pk_x0,dk,sk应该是私有的。如果你没有将它们指定为私有,它将破坏正确性。
OpenMP并不总是非常适合嵌套并行性。它取决于OpenMP设置,但嵌套循环可以创建p * p线程,其中p是机器的默认并发。如此大规模的超额订购可能会导致严重的性能下降。在大多数情况下,将最外层循环并行化并使嵌套循环保持连续是好的。
嵌套循环并行化的原因之一是实现更好的工作平衡。但是你的情况似乎有平衡的工作,如果你只对最外层的循环进行并行化,你不应该面对工作平衡问题。
但是,如果你仍想要并行化两个循环,我建议使用英特尔TBB而不是OpenMP?您可以将tbb :: parallel_for用于最外层循环,将tbb :: parallel_reduce用于嵌套循环。英特尔TBB为其所有算法使用一个线程池,因此不会导致您的应用程序超额订阅。
[已更新]某些并行化建议:
答案 1 :(得分:0)
我认为您对嵌套并行化的想法并不适合OpenMP思维模式。尽管嵌套并行性可以在OpenMP中实现,但它带来了比必要更多的复杂性。通常在OpenMP中,您只能同时并行一个循环。
应该在具有最小交错依赖性的级别上进行并行化。这通常是最高级别的。在您的特定情况下,这也是正确的,因为外部循环中的步骤不是强耦合的。
我不知道代码的其余部分是做什么的,尤其是hpk
,hqk
,n2pk_x0
,dk
和{的价值所发生的变化{1}}。您所要做的就是在代码中添加sk
。