并行化这个带有循环携带依赖(loop-carried dependency)的嵌套循环的最佳方法是什么? 需要说明的是,它位于一个我从 main 中调用 n 次的函数内部。
[编辑]
/*
 * One update step over the grid: reads the buffer grid[cur] and writes the
 * buffer grid[1-cur]. A cell holding 2 moves from row i-1 to row i (same
 * column j) when the destination cell holds 0; every other cell in rows
 * 1..n+1 is copied unchanged.
 *
 * grid : two buffers indexed as grid[buffer][row][column]; rows 0..n+1 and
 *        columns 1..n are accessed.
 * n    : upper bound of the column range; rows run one further, to n+1.
 *
 * Uses the file-level variables `cur` (index of the source buffer) and
 * `thread_count`.
 *
 * The previous version let the iteration for row i also write row i-1 of the
 * output. Under `omp parallel for` the iteration for row i-1 may run on a
 * different thread and write the same cell, so the final value depended on
 * timing. Here every output cell is computed by exactly one iteration from
 * the source buffer only (a "gather"), which gives the same result as the
 * serial loop and needs no synchronization.
 */
void funct(unsigned char*** grid, int n) {
    int i, j;
    /* Hoisted so the parallel region only touches the two row tables. */
    unsigned char **src = grid[cur];
    unsigned char **dst = grid[1 - cur];

#   pragma omp parallel for num_threads(thread_count) default(none) \
        shared(src, dst, n) private(i, j)
    for (i = 0; i <= n + 1; i++) {
        for (j = 1; j <= n; j++) {
            if (i >= 1 && src[i-1][j] == 2 && src[i][j] == 0) {
                /* A 2 moves in from the row above. */
                dst[i][j] = 2;
            }
            else if (i <= n && src[i][j] == 2 && src[i+1][j] == 0) {
                /* This cell's 2 moves down to row i+1, leaving 0 behind.
                   Cannot coincide with the branch above: that one needs
                   src[i][j] == 0, this one needs src[i][j] == 2. */
                dst[i][j] = 0;
            }
            else if (i >= 1) {
                dst[i][j] = src[i][j];
            }
            /* Row 0 is only ever cleared, never copied, exactly as in the
               serial loop, so it is left untouched otherwise. */
        }
    }
}
主函数(main)中的调用:
/* Driver loop (fragment of main): each step calls funct and then funct_2,
   flipping `cur` between 0 and 1 after each call so that the buffer funct
   wrote (grid[1-cur]) becomes the source buffer grid[cur] of the next call.
   NOTE(review): funct_2 is not shown here; presumably it follows the same
   read-grid[cur] / write-grid[1-cur] convention -- confirm. */
cur = 0;
for(s = 0; s < steps; s++) {
    funct(grid, N);
    cur = 1-cur;
    funct_2(grid, N);   /* terminating ';' was missing */
    cur = 1-cur;        /* terminating ';' was missing */
}
此代码运行时不报错,但生成的结果(.ppm文件)是错误的。
答案 0 :(得分:1)
循环嵌套的优化/并行化是我博士研究的课题。下面的代码是用我自己的优化编译器自动生成的。它采用了分块(tiling)变换,并将提取出的(分块后的)无需同步的切片并行执行。
/* Smaller of two values. Being a macro, an argument may be evaluated twice. */
#define min(x,y) ((x) < (y) ? (x) : (y))
/* Integer division n/d rounded toward negative infinity (floor). The n < 0
   branch compensates for C's truncation toward zero.
   NOTE(review): the formula presumes d > 0; every use below passes d = 32. */
#define floord(n,d) (((n)<0) ? -((-(n)+(d)-1)/(d)) : (n)/(d))
/*
 * Tiled version of the update step: reads grid[cur] and writes
 * grid[-cur + 1]. `grid`, `cur` and `n` are not declared here and must be in
 * scope at the point where this fragment is placed.
 *
 * Rows are processed in bands of 32 (index ir0) and columns in bands of 32
 * (index ir1). The `omp parallel for` applies to the ir0 loop, so row bands
 * are what gets distributed over threads. From the index ranges below, the
 * iteration for a given ir0 writes only output rows 32*ir0+1 .. 32*ir0+32
 * (plus row 0 when ir0 == 0).
 *
 * NOTE(review): `#pragma scop` / `#pragma endscop` look like region markers
 * left by the generating tool; a compiler that does not know them normally
 * ignores them -- confirm for your toolchain.
 */
#pragma scop
#pragma omp parallel for
for (register int ir0 = 0; ir0 <= floord(n, 32); ir0 += 1) {
for (register int ir1 = 0; ir1 <= floord(n - 1, 32); ir1 += 1) {
if (ir0 == 0) {
/* Row 0: becomes 0 when its 2 moves down into an empty row-1 cell;
   otherwise it is assigned its own current value (i.e. left as is). */
for (register int i1 = 32 * ir1 + 1; i1 <= min(n, 32 * ir1 + 32); i1 += 1) {
grid[-cur + 1][0][i1] = (((grid[cur][0][i1] == 2) && (grid[cur][1][i1] == 0)) ? 0 : grid[-cur + 1][0][i1]);
}
}
if (n >= 32 * ir0 + 1) {
/* ii0 == ir0 walks the rows of this band. ii0 == ir0 + 1 yields at most the
   single index i0 = 32*ir0 + 33, which is used only to clear the band's last
   row (i0 - 1 = 32*ir0 + 32). */
for (register int ii0 = ir0; ii0 <= min(ir0 + 1, n / 32); ii0 += 1) {
for (register int i0 = 32 * ii0 + 1; i0 <= min(n + 1, 31 * ir0 + ii0 + 32); i0 += 1) {
for (register int i1 = 32 * ir1 + 1; i1 <= min(n, 32 * ir1 + 32); i1 += 1) {
/* Skipped for the first row of the band: row i0 - 1 would then be 32*ir0,
   which is outside this band's rows. */
if (i0 >= 32 * ir0 + 2) {
/* Row i0 - 1: set to 0 when its 2 moved down to row i0; otherwise keep the
   value already stored in the output buffer. Runs after row i0 - 1 received
   its copy/2 in the previous i0 iteration, so the order matters. */
grid[-cur + 1][i0 - 1][i1] = (((grid[cur][i0 - 1][i1] == 2) && (grid[cur][i0][i1] == 0)) ? 0 : grid[-cur + 1][i0 - 1][i1]);
}
if (ii0 == ir0) {
/* Row i0: receives 2 when one moves in from row i0 - 1, else a plain copy. */
grid[-cur + 1][i0][i1] = (((grid[cur][i0 - 1][i1] == 2) && (grid[cur][i0][i1] == 0)) ? 2 : grid[cur][i0][i1]);
}
}
}
}
} else {
/* Reached when 32*ir0 >= n; with the ir0 loop bound above this means
   32*ir0 == n (for n >= 0), so only row n + 1 is left and it gets the
   move-in/copy update. The column bound is not clamped with min(n, ...)
   here; when n is a multiple of 32, 32*ir1 + 32 does not exceed n. */
for (register int i1 = 32 * ir1 + 1; i1 <= 32 * ir1 + 32; i1 += 1) {
grid[-cur + 1][n + 1][i1] = (((grid[cur][n][i1] == 2) && (grid[cur][n + 1][i1] == 0)) ? 2 : grid[cur][n + 1][i1]);
}
}
}
}
#pragma endscop
我用8个线程在 grid[2][N][N] 上测试了这段代码,其中 N = {2500, 5500}。与原始代码的串行执行相比,我获得了3倍的加速。