Question

我想在 while 循环内部使用并行的 for 循环，类似这样：

while(!End){
    for(...;...;...) // the parallel for

    ...
    // serial code
}

for 循环是 while 循环中唯一需要并行的部分。但如果像下面这样写，会产生很大的开销：

// Asker's version: the combined "parallel for" construct sits inside the while
// loop, so a parallel region is entered and left on every outer iteration.
cycles = 0;
while(!End){ // roughly 1e9 outer iterations
    #pragma omp parallel for
    for(i=0;i<N;i++) // the parallel loop, roughly 256 iterations
        if(time[i] == cycles){
           // A still-busy entry whose time equals the current cycle is marked finished.
           if (wbusy[i]){
               wbusy[i] = 0;
               wfinished[i] = 1;
           }
        }


    // serial code, executed after the parallel loop has completed
    ++cycles;    

}

for循环的每次迭代都是相互独立的。

串行代码和并行代码之间存在依赖关系。

Answer 1

通常不必太担心把并行区域放在循环内部，因为现代的 OpenMP 实现会复用线程组（thread team），效率很高；只要循环里有足够多的工作就没有问题。但在这里，外层循环约 1e9 次，内层循环只有约 256 次，而且每次迭代的工作量非常小，因此并行开销很可能与实际工作量相当甚至更大，性能会受到影响。

所以这之间会有明显的区别：

// Variant with the parallel region inside the loop: "parallel for" is a combined
// construct, so the region is entered and left once per outer iteration.
cycles = 0;
while(!End){ // roughly 1e9 outer iterations
    #pragma omp parallel for
    for(i=0;i<N;i++) // the parallel loop, roughly 256 iterations
        if(time[i] == cycles){
           // A still-busy entry whose time equals the current cycle is marked finished.
           if (wbusy[i]){
               wbusy[i] = 0;
               wfinished[i] = 1;
           }
        } 

    // serial code, executed after the parallel loop has completed
    ++cycles;    
}

和此：

// Variant with a single parallel region around the whole while loop: every
// thread of the team executes the while loop itself.
cycles = 0;
#pragma omp parallel
while(!End){ // roughly 1e9 outer iterations
    // Worksharing only: the iterations of the i-loop are divided among the
    // threads of the enclosing parallel region. No nowait clause is given, so
    // the construct ends with an implicit barrier.
    #pragma omp for
    for(i=0;i<N;i++) // the parallel loop, roughly 256 iterations
        if(time[i] == cycles){
           // A still-busy entry whose time equals the current cycle is marked finished.
           if (wbusy[i]){
               wbusy[i] = 0;
               wfinished[i] = 1;
           }
        } 

    // serial code
    // Run by exactly one thread; without nowait the other threads wait at the
    // implicit barrier at the end of the single construct, so all threads see
    // the updated cycles before the next pass.
    #pragma omp single 
    {
      ++cycles;    
    }
}

但实际上，每次迭代都扫描整个 time 数组，不幸地既（a）很慢，又（b）没有足够的工作量让多个核心保持忙碌——它是内存密集型的。使用多个线程时，即使不考虑并行开销，仅仅由于内存争用，性能实际上也会比串行版本更差。诚然，您在这里贴出的只是一个示例，而不是真实代码，但为什么不对 time 数组做预处理，这样只需检查下一个任务何时到期即可：

#include <stdio.h>
#include <stdlib.h>

/* Pairs a task's scheduled time with the task's index. */
struct tasktime_t {
    long int time;  /* time value the entries are ordered by */
    int task;       /* index of the task this entry refers to */
};

/*
 * qsort() comparator: orders struct tasktime_t entries by ascending time.
 *
 * a, b: pointers to the two struct tasktime_t elements being compared.
 * Returns -1 if a sorts before b, 1 if it sorts after, 0 if the times are equal.
 *
 * The result is built from comparisons instead of `a->time - b->time`: the
 * difference of two long values need not fit in an int (and may overflow),
 * which would yield a wrong sign and a mis-sorted array.
 */
int stime_compare(const void *a, const void *b) {
    const struct tasktime_t *lhs = a;
    const struct tasktime_t *rhs = b;

    return (lhs->time > rhs->time) - (lhs->time < rhs->time);
}

/*
 * Demo driver: gives each of n tasks a pseudo-random time, sorts the
 * (time, task) pairs, then steps a cycle counter from 0 to niters and retires
 * tasks as their time is reached. Only the next pending entry of the sorted
 * list is inspected on each cycle.
 *
 * argc/argv are unused. Always returns 0.
 */
int main(int argc, char **argv) {
    const int n = 256;
    const long int niters = 100000000L;
    long int time[n];
    int wbusy[n];
    int wfinished[n];
    struct tasktime_t stimes[n];

    /* Every task starts busy and unfinished; record its time alongside its index. */
    for (int t = 0; t < n; ++t) {
        time[t] = rand() % niters;
        wbusy[t] = 1;
        wfinished[t] = 0;

        stimes[t].time = time[t];
        stimes[t].task = t;
    }

    /* Ascending by time, so the entry at `pending` is always the next one due. */
    qsort(stimes, n, sizeof stimes[0], stime_compare);

    int pending = 0;
    for (long int cycles = 0; cycles < niters; ++cycles) { /* outer simulation loop */
        /* Retire every task whose time equals the current cycle. */
        for (; pending < n && stimes[pending].time == cycles; ++pending) {
            const int task = stimes[pending].task;

            if (!wbusy[task]) {
                continue;
            }
            wbusy[task] = 0;
            wfinished[task] = 1;
        }
    }

    return 0;
}

这比扫描方法的串行版本快约 5 倍（也比 OpenMP 版本快）。即使您在串行代码中不断更新 time / wbusy / wfinished 数组，也可以用优先队列（priority queue）来跟踪各任务的完成时间：每次更新只需 O(log N) 时间，而不是每次迭代都花 O(N) 时间去扫描。

在 C 中使用 OpenMP 并行化 while 循环

1 个答案: