For educational purposes, I modified a ray tracer I had written earlier to take advantage of OpenMP for multiprocessing. However, I am not seeing any benefit from the parallelization.
I tried three different approaches: a task-pool setup (the draw_pooled() function), a standard OMP parallel nested for loop with image-row-level parallelism (draw_parallel_for()), and another OMP parallel for with pixel-level parallelism (draw_parallel_for2()). The original serial drawing routine is also included for reference (draw_serial()).
I'm running 2560x1920 renders on an Intel Core 2 Duo E6750 (2 cores @ 2.67 GHz, no HyperThreading) with 4 GB of RAM under Linux, the binary compiled by gcc with libgomp. On average, the scenes take:
Why is this happening? I can't see any obvious bottleneck in the parallel code.
EDIT: To clarify, the task pool is just one of the implementations. Please read the whole question and scroll down to see the parallel fors. The thing is, they are just as slow as the task pool!
void draw_parallel_for(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    // Parallelism over the pixels of a single row: a parallel region
    // is entered once per row of the image.
    for (int y = 0; y < h; ++y) {
#pragma omp parallel for num_threads(4)
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
void draw_parallel_for2(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    // Pixel-level parallelism: one flat loop over all w * h pixels.
    int x, y;
#pragma omp parallel for private(x, y) num_threads(4)
    for (int xy = 0; xy < w * h; ++xy) {
        x = xy % w;
        y = xy / w;
        Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
void draw_parallel_for3(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    // Row-level parallelism: the parallel region is entered only once.
#pragma omp parallel for num_threads(4)
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
void draw_serial(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
std::queue< std::pair<int, int> * > task_queue;

void draw_pooled(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    bool tasks_issued = false;
#pragma omp parallel shared(buf, tasks_issued, w, h) num_threads(4)
    {
        // The master thread fills the queue with one task per pixel.
#pragma omp master
        {
            for (int y = 0; y < h; ++y) {
                for (int x = 0; x < w; ++x)
                    task_queue.push(new std::pair<int, int>(x, y));
            }
            tasks_issued = true;
        }
        // All threads (master included) pop tasks until the queue drains.
        while (true) {
            std::pair<int, int> *coords;
#pragma omp critical(task_fetch)
            {
                if (task_queue.size() > 0) {
                    coords = task_queue.front();
                    task_queue.pop();
                } else
                    coords = NULL;
            }
            if (coords != NULL) {
                Scene::GetInstance().RenderPixel(coords->first, coords->second,
                                                 buf + (coords->second * w + coords->first) * 3);
                delete coords;
            } else {
#pragma omp flush(tasks_issued)
                if (tasks_issued)
                    break;
            }
        }
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}
Answer 0 (score: 3)
You have a critical section inside your innermost loop. In other words, you are hitting a synchronization primitive per pixel. That is going to kill performance.
Better to split the scene into tiles and work on one per thread. That way you have a longer stretch of work (a whole tile's worth of processing) between any two synchronizations, as in the sketch below.
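A minimal sketch of that tiling idea, reusing the question's Scene and write_png interfaces; the function name draw_tiled and the tile size of 32 are assumptions, not part of the original code:

void draw_tiled(int w, int h, const char *fname) {
    const int TILE = 32; // arbitrary tile edge length; tune for your scene
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    int tiles_x = (w + TILE - 1) / TILE;
    int tiles_y = (h + TILE - 1) / TILE;
    // One loop iteration renders a whole tile, so the scheduler hands out
    // work in tile-sized chunks instead of per pixel.
#pragma omp parallel for num_threads(4)
    for (int t = 0; t < tiles_x * tiles_y; ++t) {
        int tx = (t % tiles_x) * TILE;
        int ty = (t / tiles_x) * TILE;
        for (int y = ty; y < ty + TILE && y < h; ++y)
            for (int x = tx; x < tx + TILE && x < w; ++x)
                Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
    write_png(buf, w, h, fname);
    delete [] buf;
}

Since every tile writes to a disjoint region of buf, no locking is needed at all.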
Answer 1 (score: 0)
If the pixels are independent, you don't actually need any locking. You can just divide the image into rows or columns and let the threads work on their own. For example, you could have each thread operate on every nth row (pseudocode):
for (int y = THREAD_NUM; y < h; y += THREAD_COUNT)
    for (int x = 0; x < w; ++x)
        render_pixel(x, y);
where THREAD_NUM is a unique number per thread, with 0 <= THREAD_NUM < THREAD_COUNT. Then, after joining your thread pool, perform the PNG conversion.
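Since the question already uses OpenMP, the same every-nth-row interleaving can be expressed with a static schedule of chunk size 1; this sketch reuses the question's names (draw_interleaved itself is a name made up here):

void draw_interleaved(int w, int h, const char *fname) {
    unsigned char *buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    // schedule(static, 1) deals rows out round-robin: with T threads,
    // thread k renders rows k, k + T, k + 2T, ... with no locking.
#pragma omp parallel for schedule(static, 1) num_threads(4)
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    write_png(buf, w, h, fname);
    delete [] buf;
}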
Answer 2 (score: 0)
There is always a performance overhead to creating threads. An OMP parallel region inside a for loop clearly generates a lot of overhead. For example, in your code
void draw_parallel_for(int w, int h, const char *fname) {
    for (int y = 0; y < h; ++y) {
        // Here there is a lot of overhead
#pragma omp parallel for num_threads(4)
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
}
can be rewritten as
void draw_parallel_for(int w, int h, const char *fname) {
#pragma omp parallel for num_threads(4)
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
}
or
void draw_parallel_for(int w, int h, const char *fname) {
    // The thread team is created once; each row's pixels are then
    // shared among the team by the inner work-sharing loop.
#pragma omp parallel num_threads(4)
    for (int y = 0; y < h; ++y) {
#pragma omp for
        for (int x = 0; x < w; ++x)
            Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
    }
}
This way, you eliminate that overhead.
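One way to check which variant actually helps is to time them with omp_get_wtime(); a minimal harness, assuming the functions from the question are linked in (the output file names are placeholders):

#include <cstdio>
#include <omp.h>

int main() {
    const int W = 2560, H = 1920; // the resolution used in the question
    double t0 = omp_get_wtime();
    draw_serial(W, H, "serial.png");
    double t1 = omp_get_wtime();
    draw_parallel_for3(W, H, "parallel.png");
    double t2 = omp_get_wtime();
    std::printf("serial:   %.2f s\n", t1 - t0);
    std::printf("parallel: %.2f s\n", t2 - t1);
    std::printf("speedup:  %.2fx\n", (t1 - t0) / (t2 - t1));
    return 0;
}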