Question

我有一个生成数据的大循环。每次迭代大约需要 1 秒钟，并产生一大块数据。我需要按正确的顺序把所有数据块写入文件。

如果我只想并行化循环，我可以编写类似的东西（高度简化）：

    /* First attempt from the question: four hand-rolled workers each take every
     * fourth iteration and append their result to one shared output file. */
    FILE* f = fopen("output.txt", "w");
    omp_lock_t lock;
    omp_init_lock(&lock);
    int nIterations = 1000000;
#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    {
        /* Declared inside the loop body, so every outer iteration gets its own a, b, c. */
        int a=0, b=0, c=0;
        /* Strided inner loop: worker `thread` handles n = thread, thread+4, thread+8, ... */
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            /* The lock only prevents two fprintf calls from overlapping; it does not
             * decide which worker writes next, so the line order in the file is arbitrary. */
            omp_set_lock(&lock);
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    }
    /* NOTE(review): this barrier sits after the parallel loop has already ended,
     * where only the initial thread is running, so it synchronizes nothing. */
#pragma omp barrier
    fclose(f);
    omp_destroy_lock(&lock);

这样可以把输出写入文件，但无法保证各条目的顺序。

我希望同步执行以便所有线程执行其任务，然后主线程写入文件，然后线程恢复。换句话说，我想要这样的事情：

    /* Second attempt from the question (a sketch of the desired behaviour, not
     * working code): all workers compute one value, worker 0 writes the four
     * values in order, then everybody continues with the next round. */
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            /* NOTE(review): values is declared inside the loop body, so each worker
             * has its own copy; worker 0 would not see the other workers' results. */
            int values[4];
            for(int n=thread; n<nIterations; n+=4)
            {
                /* NOTE(review): values has 4 elements but is indexed by n, which runs
                 * up to nIterations-1; presumably values[thread] was intended. */
                values[n] = do_computations(&a, &b, &c);
                /* A barrier nested inside a worksharing loop is what the OpenMP
                 * specification forbids, as the surrounding text points out. */
#pragma omp barrier
                if(thread == 0)
                {
                      for(int i=0; i<4; i++)
                        fprintf(f, "%d\n", values[i]);
                }
                /* Intended to hold the other workers back until worker 0 has finished writing. */
#pragma omp barrier
            }
        }
#pragma omp barrier

只不过，出于某种莫名其妙的原因，OpenMP 规范禁止这样做。

或者我可以试试

    /* Third attempt from the question: keep the hand-rolled four-worker split and
     * try to serialize the writes with an ordered region. */
    #pragma omp parallel for
        for(int thread=0; thread<4; thread++)
        {
            int a=0, b=0, c=0;
            for(int n=thread; n<nIterations; n+=4)
            {
                int value = do_computations(&a, &b, &c);
                /* One iteration of the outer (workshared) loop runs this inner loop many
                 * times, so it would execute the ordered directive more than once; that
                 * is the restriction quoted in the text below.
                 * NOTE(review): the loop directive also carries no `ordered` clause;
                 * check the OpenMP specification for that requirement. */
#pragma omp ordered
                {
                    fprintf(f, "%d\n", value);
                }
            }
        }
    /* NOTE(review): this barrier is placed after the parallel loop, outside any parallel region. */
    #pragma omp barrier
        fclose(f);

但这也行不通，因为“使用 for 构造的循环的一次迭代……不得执行多于一个 ordered 指令”。

我不想将代码重写为单个循环，我不想交换循环。

有没有一种干净的方法可以使用OpenMP执行此操作，而无需其他线程/同步工具？

Answer 1

你正在尝试做两件事：计算和 IO。计算可以并行化，但 IO 必须串行执行。然而，把 IO 放在与计算相同的循环里，就等于强制计算也串行化，这没有任何意义。

更好的做法是先完成所有计算，然后再做 IO。即使 IO 是串行的，这样几乎肯定也会更快，特别是如果你可以把数据以二进制形式一次性写成一大块，而不是循环调用 fprintf。

    /* Compute every value in parallel first, then write them out serially so the
     * order of lines in the file matches the iteration order. */
    FILE* f = fopen("output.txt", "w");
    const int nIterations = 1000000;
    /* NOTE(review): one million ints as an automatic array needs several megabytes
     * of stack; switch to heap allocation if the stack limit is smaller. */
    int values[nIterations];

    /* Iterations are independent: each one writes only its own slot values[n]. */
#pragma omp parallel for 
    for(int n=0; n<nIterations; n++)
    {
        int a=0, b=0, c=0;
        values[n] = do_computations(&a, &b, &c);
    }

    /* Serial output pass, in index order. */
    for (int n=0; n<nIterations; n++)
         fprintf(f,"%d\n", values[n]);

    fclose(f);

当然，这需要更多的内存，但用内存换速度是一种常见的权衡。如果把所有结果都放在内存里这种极端做法行不通，你还可以按大小可调的块分批进行计算：

    /* Chunked variant: compute chunkSize values in parallel, write them serially,
     * and repeat, so that only one chunk has to be held in memory at a time.
     * `f` is the output stream; it is not opened in this fragment. */
    const int nIterations = 1000000;
    const int chunkSize   = 10000;
    int values[chunkSize];

    for (int start = 0; start < nIterations; start += chunkSize) {

        /* The last chunk may be shorter than chunkSize. Comparing the remaining
         * count avoids computing start+chunkSize, which could overflow for large inputs. */
        const int remaining = nIterations - start;
        const int chunkLeft = (remaining < chunkSize) ? remaining : chunkSize;

    #pragma omp parallel for
        for(int n=start; n<start+chunkLeft; n++)
        {
            int a=0, b=0, c=0;
            /* Store relative to the start of the chunk so the buffer can be reused. */
            values[n-start] = do_computations(&a, &b, &c);
        }

        /* Serial output of the finished chunk, in index order. */
        for (int n=0; n<chunkLeft; n++)
             fprintf(f,"%d\n", values[n]);

    }
    fclose(f);

Answer 2

我会尝试提出以前答案中尚未出现的解决方案：

#include <stdio.h>
#include <assert.h>
#include <unistd.h>

#define NITER 100

/*
 * Demonstrates order-independent output from a parallel loop: every iteration
 * writes its fixed-size record at the file offset derived from its own index,
 * so the file ends up in index order no matter which iteration finishes first.
 * The file is then read back serially to verify the order.
 *
 * Returns 0 on success, 1 if the file cannot be opened, written or read back.
 */
int main(void) {

  /* "b": the records are raw ints, so the stream must not translate any bytes. */
  FILE * f = fopen("output.bin", "wb+");
  if (f == NULL) {
    perror("fopen output.bin");
    return 1;
  }

  /* Shared failure flag; it is only modified inside the FILEWRITE critical
     section and only read after the parallel region has ended. */
  int write_failed = 0;

#pragma omp parallel
  {
#pragma omp for schedule(runtime)
    for (int ii = 0; ii < NITER; ++ii) {
      sleep(1); // Simulate computation
      printf("%d\n",ii); // Just to be convinced that the loop is not evaluated in serial order
      /* The stream has a single file position, so the seek and the write must
         happen as one unit with respect to the other threads. */
#pragma omp critical (FILEWRITE)
      {
        if (fseek(f, (long)(sizeof(ii) * (size_t)ii), SEEK_SET) != 0 ||
            fwrite(&ii, sizeof(ii), 1, f) != 1) {
          write_failed = 1;
        }
      }
    }
  }

  if (write_failed) {
    fprintf(stderr, "failed to write output.bin\n");
    fclose(f);
    return 1;
  }

  // Check serially that the file is written in the right order
  if (fseek(f, 0, SEEK_SET) != 0) {
    perror("fseek output.bin");
    fclose(f);
    return 1;
  }
  for (int ii = 0; ii < NITER; ++ii) {
    int value = -1;
    if (fread(&value, sizeof(value), 1, f) != 1) {
      fprintf(stderr, "failed to read record %d from output.bin\n", ii);
      fclose(f);
      return 1;
    }
    assert( value == ii );
  }

  /* fclose flushes buffered data, so its result matters for a written stream. */
  if (fclose(f) != 0) {
    perror("fclose output.bin");
    return 1;
  }
  return 0;
}

这种方法仅适用于每个数据块的大小都是确定的情况：这样只要知道当前计算的是哪一次迭代，就可以推算出该块相对于文件开头的偏移量。

话虽如此，您提供的代码段中有不少错误，说明您需要先复习一下 OpenMP 的基础知识。例如：

#pragma omp parallel for
    for(int thread=0; thread<4; thread++)
    { // No need to split the loop across threads by hand: the OpenMP runtime
      // maps iterations onto threads according to the scheduling policy
        int a=0, b=0, c=0;
        for(int n=thread; n<nIterations; n+=4)
        {
            int value = do_computations(&a, &b, &c);
            // No need for an explicit lock when a critical construct suffices
            omp_set_lock(&lock); 
            fprintf(f, "%d\n", value);
            omp_unset_lock(&lock);
        }
    } // There is an implicit barrier at the end of the parallel for
#pragma omp barrier 
// Why a barrier here, where only one thread is running?

Answer 3

来得有点晚，但如果有人来这里寻找答案：你需要的是 `for i in itertools.product(range(1, 10), repeat=22): ... #Don't actually print, that may block your computer for a long time.`（注：这段行内代码是 Python，与本问题的 C/OpenMP 上下文不符，疑似抓取时混入，请对照原帖核实），不过我也同意 @jonathan-dursi 的看法。

循环内的OpenMP同步

3 个答案: