Question

我想计算一段简单代码并行化之后的加速比。这段代码只是一个简单的循环。首先，我在 C++ 中使用 OpenMP 将它并行化。然后我测量每个线程的执行时间，并把其中最长的线程时间作为并行执行时间。我用不同的线程数重复了这个实验，但线程越多，时间反而越长！能帮我看看是什么原因吗？

#include "stdafx.h"
#include "omp.h"
#include "conio.h"
// Per-thread accumulated elapsed time, indexed by OpenMP thread id (fixed capacity of 64 entries).
double diftime[64];
// File-scope (therefore zero-initialized) variables: `a` is the counter incremented
// by the loop, `i` is the loop index, `threadnum` is the requested team size.
int a,i,threadnum;
// Runs a 2,000,000-iteration increment loop inside an OpenMP parallel region and
// prints, for each thread, the wall-clock time it spent on its share of the loop.
int main()
{
threadnum=2;
// Enable nested parallelism (no nested parallel region is visible in this code).
omp_set_nested(1);
// Disable dynamic adjustment of the team size so num_threads(threadnum) is used as requested.
omp_set_dynamic(0);
#pragma omp parallel num_threads(threadnum) 
{
    // Declared inside the parallel region, so each thread has its own copies.
    double start_time,end_time;
    int id = omp_get_thread_num();
    start_time=omp_get_wtime();
    // Static schedule splits the iterations among the threads; `nowait` drops the
    // barrier at the end of the loop, so end_time below is taken as soon as this
    // thread finishes its own chunk.
    // NOTE(review): `a` is a shared file-scope variable and `a++` is not protected
    // by atomic/critical/reduction, so concurrent increments race with each other.
    // NOTE(review): the loop body does nothing but increment `a`; an optimizing
    // compiler may collapse it, leaving mostly thread start-up cost to be timed.
    #pragma omp for nowait schedule(static)
    for (i=0;i<2000000;i++){a++;}
    end_time=omp_get_wtime();
    // Accumulate this thread's elapsed time (seconds, as returned by omp_get_wtime).
    diftime[id]=diftime[id]+(end_time-start_time);      
    // printf has no visible <stdio.h> include here; presumably it comes from stdafx.h — confirm.
    printf("thread[%d] = %.32g\n",id,end_time-start_time);  
}
// Wait for a key press before exiting (conio.h).
getch();
return 0;
}

Answer 1

原因是你的循环体非常简单，编译器会把整个循环优化掉，直接用循环结束后 a 的最终值来代替它。看一下这个例子：

#include <stdio.h>

/*
 * Minimal demonstration of loop folding: increments a counter 2^20 times and
 * prints the result. Because the loop has no other observable effect, an
 * optimizing compiler is free to replace it with the final constant.
 *
 * Returns 0.
 */
int main(void)
{
   size_t i;
   unsigned a = 0;
   for (i = 0; i < (1UL << 20); i++) { // the loop should run 1048576 times
      a++;
   }
   // "%u" is the complete conversion for unsigned int; a trailing 'd' would be
   // printed as a literal character after the number.
   printf("%u\n", a);
   return 0;
}

但是当我们通过 gcc -O2 -S test.c 查看生成的汇编指令时，会发现：

# Compiler output for main() of the example above (gcc -O2 -S, AT&T syntax).
# Note that the listing contains no compare or jump instructions: there is no loop.
_main:
LFB20:
   # Reserve 8 bytes of stack (presumably to keep the stack aligned for the call).
   subq   $8, %rsp
LCFI0:
   movl   $1048576, %esi  # the loop is replaced by a's value!
   # Zero %eax before calling printf (presumably the variadic-call convention:
   # number of vector registers used — confirm against the platform ABI).
   xorl   %eax, %eax
   # Load the address of LC0 into %rdi; LC0 is presumably the format string (not shown here).
   leaq   LC0(%rip), %rdi
   call   _printf
   # Zero %eax again: this is main's return value 0.
   xorl   %eax, %eax
   # Release the 8 bytes reserved on entry.
   addq   $8, %rsp
LCFI1:
   ret

所以，你的测量时间上升的原因是它需要更多的时间来生成和处理更多的线程（它们什么都不做）。

如果要强制编译器保留循环，则应该把循环中使用的变量声明为 volatile，如下所示：

#include <stdio.h>

#include <omp.h>

/* Maximum number of threads whose timings can be recorded in diftime. */
enum { MAX_THREADS = 64 };

/* Accumulated wall-clock time per OpenMP thread, indexed by thread id. */
double diftime[MAX_THREADS];

/*
 * Times a work-shared counting loop on every thread of an OpenMP team.
 *
 * Each thread counts its share of 2^20 iterations into a private volatile
 * counter, prints how long that took, and then adds its count to the shared
 * total, which is printed once the parallel region has finished.
 *
 * Returns 0.
 */
int main(void)
{
   unsigned a = 0;
#pragma omp parallel
   {
      double start_time, end_time;
      int id = omp_get_thread_num();
      start_time = omp_get_wtime();
      /* volatile forces a real load/store on every iteration, so the
         optimizer cannot replace the loop with a constant. Declared inside
         the parallel region, so every thread has its own counter. */
      volatile unsigned b = 0;
#pragma omp for nowait schedule(static)
      for (int i = 0; i < (1 << 20); i++) {
         b++;
      }
      /* nowait: no barrier after the loop, so this is the time at which this
         thread finished its own chunk. */
      end_time = omp_get_wtime();
      /* The default team size is chosen by the runtime and may exceed the
         table's capacity; never write past the end of diftime. */
      if (id < MAX_THREADS) {
         diftime[id] = diftime[id] + (end_time - start_time);
      }
      printf("thread[%d] = %.32g\n", id, end_time - start_time);
// ensure only one thread at a time executes the next line
#pragma omp critical
      a += b;
   }
   /* a is unsigned, so it must be printed with %u rather than %d. */
   printf("a = %u\n", a);
   return 0;
}

为什么我的 OpenMP 代码的执行时间反而增加了？

1 个答案: