First of all, my English is not very good, so I apologize in advance if this is badly written...
I am learning how to parallelize C code with OpenMP. The algorithm I am trying to parallelize is the shallow water equations algorithm, and even though I only used a simple #pragma omp parallel for in the most critical loop, I got roughly a 40% performance improvement. I know my implementation is very poor and that I am not squeezing the cores as much as I would like. The structure of the code is simple: a 'main' that allocates memory and initializes some matrices and arrays, and then calls a function named solver that does all the work, which is where I placed the #pragma omp parallel for.
I thought I could gain more performance by using a parallel section where the memory is allocated and initialized, so that each thread owns all the data, but when I run the program I see no improvement. Since I am new to this, I don't know whether the idea itself is bad or whether the problem is my implementation. I would appreciate any help or hints that could improve the performance of the algorithm. This is my homework, so I don't want anyone to do it for me, just a little help so I can move forward...
I'll paste the code so it is easier to follow:
Main function (allocation and initialization)
int main(int argc, char **argv) {
    long int i, j, m, n, M, N;
    char *ptr;
    long int s;
    int flag, verbose;
    double *Q;
    double *x, *y;
    double **ffx, **nFx, **ffy, **nFy;
    double dx, dt, epsi, delta, dy, tend, tmp, stime;

    /* Default values to use: m volumes in the x-direction and n volumes in the y-direction */
    M = 1000;
    N = 1000;

    /* create file and verbose flags */
    .......
    .......
    /* Parse command line options */
    .......
    .......

    epsi = 2.0;
    delta = 0.5;
    dx = (xend - xstart) / (double) M;
    dy = (yend - ystart) / (double) N;
    dt = dx / sqrt( 9.81 * 5.0);
    tend = 0.1;

    /* Add two ghost volumes at each side of the domain */
    m = M + 2;
    n = N + 2;

    /* Allocate memory for the domain */
    /* HERE IS WHERE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS */
    #pragma omp parallel
    {
        Q = (double *) malloc(m * n * cell_size * sizeof(double));
        x = (double *) malloc(m * sizeof(double));
        y = (double *) malloc(n * sizeof(double));

        /* Allocate memory for fluxes */
        ffx = (double **) malloc(cell_size * sizeof(double *));
        ffy = (double **) malloc(cell_size * sizeof(double *));
        nFx = (double **) malloc(cell_size * sizeof(double *));
        nFy = (double **) malloc(cell_size * sizeof(double *));
        ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
        nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
        ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
        nFy[0] = (double *) malloc(cell_size * n * sizeof(double));

        for (i = 0; i < cell_size; i++) {
            ffx[i] = ffx[0] + i * m;
            nFx[i] = nFx[0] + i * m;
            ffy[i] = ffy[0] + i * n;
            nFy[i] = nFy[0] + i * n;
        }

        for (i = 0, tmp = -dx/2 + xstart; i < m; i++, tmp += dx)
            x[i] = tmp;
        for (i = 0, tmp = -dy/2 + ystart; i < n; i++, tmp += dy)
            y[i] = tmp;

        /* Set initial Gauss hump */
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                Q(0, i, j) = 4.0;
                Q(1, i, j) = 0.0;
                Q(2, i, j) = 0.0;
            }
        }
        for (i = 1; i < m-1; i++) {
            for (j = 1; j < n-1; j++) {
                Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
                                              (pow(delta, 2)));
            }
        }
    }

    // Record start time
    stime = gettime();

    /* THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE */
    solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);
}
SOLVER FUNCTION (the critical part)
/*
This is the main solver routine.
*/
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
            int m, int n, double tend, double dx, double dy, double dt) {
    double bc_mask[3] = {1.0, -1.0, -1.0};
    double time;
    int i, j, k, steps;

    steps = ceil(tend / dt);
    for (i = 0, time = 0.0; i < steps; i++, time += dt) {
        /* Apply boundary condition */
        #pragma omp parallel for private(j) num_threads (NTHR)
        for (k = 0; k < cell_size; k++)
        {
            for (j = 1; j < n - 1; j++)
            {
                Q(k, 0, j)   = bc_mask[k] * Q(k, 1, j);
                Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
            }
        }
        #pragma omp parallel for private(j) num_threads (NTHR)
        for (k = 0; k < cell_size; k++)
        {
            for (j = 0; j < m; j++)
            {
                Q(k, j, 0)   = bc_mask[k] * Q(k, j, 1);
                Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
            }
        }
        /* Update all volumes with the Lax-Friedrich's scheme */
        laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
    }
}
/*
This is the Lax-Friedrich's scheme for updating volumes
*/
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
                    int m, int n, double dx, double dy, double dt) {
    int i, j, k;

    /* Calculate and update fluxes in the x-direction */
    #pragma omp parallel for private(k,j) num_threads (NTHR)
    for (i = 1; i < n; i++) {
        fx(Q, ffx, m, n, i);
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m; j++)
                nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m-1; j++)
                Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
    }
    /* Calculate and update fluxes in the y-direction */
    #pragma omp parallel for private(k,j) num_threads (NTHR)
    for (i = 1; i < m; i++) {
        fy(Q, ffy, m, n, i);
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < n; j++)
                nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j-1)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < n-1; j++)
                Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
    }
}
As far as I know, the loops in the solver function and its subfunctions have no data dependencies, and since putting the parallel region around the allocations and the data initialization achieved nothing, I don't know how to proceed.
Thanks in advance!
Answer 0 (score 0)
There are multiple problems with your code. First, you have a data race, because shared variables such as Q, x and y are written by all threads. Either do the allocations outside of the parallel region, or have only one thread perform them (#pragma omp master or #pragma omp single).
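For illustration, here is a minimal sketch of that idea, reusing the names and sizes from your code (whether you keep the allocations inside a parallel region at all is a separate choice):

/* Sketch: keep the parallel region, but let a single thread do the
   allocations, so the shared pointers Q, x, y are written only once. */
#pragma omp parallel
{
    #pragma omp single
    {
        Q = (double *) malloc(m * n * cell_size * sizeof(double));
        x = (double *) malloc(m * sizeof(double));
        y = (double *) malloc(n * sizeof(double));
        /* ... remaining allocations ... */
    }  /* implicit barrier: all threads see the pointers afterwards */

    /* parallel initialization can follow here */
}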
Second, the for loops in the initialization part are not actually work-shared. As written, each of those loops is executed by every thread over its entire range (again a data race, and probably a lot of cache contention). You should add a work-sharing #pragma omp for to these loops. For the nested loops, the collapse clause may be useful.
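For example, the Gauss-hump initialization loop from your main() could be work-shared like this inside the parallel region (a sketch only, using the Q macro from your code; the other initialization loops would be treated the same way):

/* Sketch: inside the existing  #pragma omp parallel  region, work-share the
   nested initialization loops; collapse(2) merges the i/j iteration space
   and makes both loop variables private. */
#pragma omp for collapse(2)
for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
        Q(0, i, j) = 4.0;
        Q(1, i, j) = 0.0;
        Q(2, i, j) = 0.0;
    }
}

A side benefit of initializing in parallel is first-touch placement: on NUMA machines the pages tend to end up near the threads that will later work on them, provided the solver loops are distributed the same way.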
Also, make sure there are no data races in the solver() and laxf_scheme_2d() functions. It looks like most of the computation time is spent inside laxf_scheme_2d(), yet that function does not run in parallel at all. Does it use OpenMP internally?
Answer 1 (score 0)
Thanks for your answer. I have had several problems with my implementation. First of all, the most important function, where all the work is done, is laxf_scheme_2d.
Regarding the Q variable, I have #define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]
This is laxf_scheme_2d:
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
                    int m, int n, double dx, double dy, double dt) {
    int i, j, k;

    /* Calculate and update fluxes in the x-direction */
    #pragma omp for
    for (i = 1; i < n; i++) {
        fx(Q, ffx, m, n, i);
        for (j = 1; j < m; j++)
            for (k = 0; k < cell_size; k++)
                nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) -
                                   dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (j = 1; j < m-1; j++)
            for (k = 0; k < cell_size; k++)
                Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
    }
    /* Calculate and update fluxes in the y-direction */
    #pragma omp for
    for (i = 1; i < m; i++) {
        fy(Q, ffy, m, n, i);
        for (j = 1; j < n; j++)
            for (k = 0; k < cell_size; k++)
                nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) -
                                   dy/dt * (Q(k, i, j) - Q(k, i, j-1)));
        for (j = 1; j < n-1; j++)
            for (k = 0; k < cell_size; k++)
                Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
    }
}
The functions fx and fy are very simple and have no data dependencies. I cannot put the #pragma omp parallel for above the first for loop because there are data races, but right now I cannot see how to change this code to overcome them.
For reference, this is the allocation part of main, without the parallel region:
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
M = 1000;
N = 1000;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));
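About the races in laxf_scheme_2d: as far as I can tell they come from the shared scratch rows ffx/ffy/nFx/nFy, which every i iteration overwrites. One idea I am considering, but have not verified, is to give each thread its own scratch buffers inside a parallel region. This assumes fx()/fy() simply fill whatever buffer they are passed; my_ffx/my_nFx are names I made up for the sketch:

/* Sketch only: per-thread scratch rows, so iterations over i no longer
   share ffx/nFx.  Shown for the x-direction sweep. */
#pragma omp parallel private(i, j, k)
{
    double *my_ffx[cell_size], *my_nFx[cell_size];  /* per-thread scratch */
    for (k = 0; k < cell_size; k++) {
        my_ffx[k] = malloc(m * sizeof(double));
        my_nFx[k] = malloc(m * sizeof(double));
    }

    #pragma omp for
    for (i = 1; i < n; i++) {
        fx(Q, my_ffx, m, n, i);   /* fluxes for column i go into the private buffer */
        for (j = 1; j < m; j++)
            for (k = 0; k < cell_size; k++)
                my_nFx[k][j] = 0.5 * ((my_ffx[k][j-1] + my_ffx[k][j])
                               - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (j = 1; j < m-1; j++)
            for (k = 0; k < cell_size; k++)
                Q(k, j, i) = Q(k, j, i) - dt/dx * (my_nFx[k][j+1] - my_nFx[k][j]);
    }

    for (k = 0; k < cell_size; k++) {
        free(my_ffx[k]);
        free(my_nFx[k]);
    }
}

The y-direction sweep would be changed in the same way.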