Question

最近，我正在研究执行相似性调度的c OpenMP代码。基本上，一个线程完成其分配的迭代之后，它将开始寻找工作量最大的其他线程，并从中窃取一些工作。

一切正常，我可以使用icc编译文件。但是，当我尝试运行它时，它给了我分割错误（核心转储）。但是有趣的是，错误并非总是会发生，也就是说，即使在我第一次运行代码时遇到错误，当我再次尝试运行时，有时它也会起作用。这对我来说很奇怪。我想知道我的代码做错了什么以及如何解决该问题。谢谢。我只是修改了方法runloop和affinity，其他方法在开始时就给出了，效果很好。

#include <stdio.h>
#include <math.h>


#define N 729
#define reps 1000 
#include <omp.h> 


double a[N][N], b[N][N], c[N];
int jmax[N];  


void init1(void);
void init2(void);
void runloop(int); 
void loop1chunk(int, int);
void loop2chunk(int, int);
void valid1(void);
void valid2(void);
int affinity(int*, int*, int, int, float, int*, int*);

int main(int argc, char *argv[]) { 

    double start1,start2,end1,end2;
    int r;

  init1(); 

  start1 = omp_get_wtime(); 

  for (r=0; r<reps; r++){ 
    runloop(1);
  } 

  end1  = omp_get_wtime();  

  valid1(); 

  printf("Total time for %d reps of loop 1 = %f\n",reps, (float)(end1-start1)); 


  init2(); 

  start2 = omp_get_wtime(); 

  for (r=0; r<reps; r++){ 
    runloop(2);
  } 

  end2  = omp_get_wtime(); 

  valid2(); 

  printf("Total time for %d reps of loop 2 = %f\n",reps, (float)(end2-start2)); 

} 

void init1(void){
  int i,j; 

  for (i=0; i<N; i++){ 
    for (j=0; j<N; j++){ 
      a[i][j] = 0.0; 
      b[i][j] = 3.142*(i+j); 
    }
  }

}

void init2(void){ 
  int i,j, expr; 

  for (i=0; i<N; i++){ 
    expr =  i%( 3*(i/30) + 1); 
    if ( expr == 0) { 
      jmax[i] = N;
    }
    else {
      jmax[i] = 1; 
    }
    c[i] = 0.0;
  }

  for (i=0; i<N; i++){ 
    for (j=0; j<N; j++){ 
      b[i][j] = (double) (i*j+1) / (double) (N*N); 
    }
  }

}  


void runloop(int loopid)
{
  int nthreads = omp_get_max_threads(); // we set it before the parallel region, using opm_get_num_threads() will always return 1 otherwise                             
    int ipt = (int) ceil((double)N/(double)nthreads);           
    float chunks_fraction = 1.0 / nthreads;                                 
    int threads_lo_bound[nthreads];                                         
    int threads_hi_bound[nthreads];                                     

    #pragma omp parallel default(none) shared(threads_lo_bound, threads_hi_bound, nthreads, loopid, ipt, chunks_fraction)
    {
        int myid = omp_get_thread_num();
        int lo = myid * ipt;
    int hi = (myid+1)*ipt;
    if (hi > N) hi = N; 

        threads_lo_bound[myid] = lo;
        threads_hi_bound[myid] = hi;

        int current_lower_bound = 0;
        int current_higher_bound = 0;
        int affinity_steal = 0;

        while(affinity_steal != -1)
        {
            switch(loopid)
            {
                case 1: loop1chunk(current_lower_bound, current_higher_bound); break;
                case 2: loop2chunk(current_lower_bound, current_higher_bound); break;
            }

            #pragma omp critical
            {
                affinity_steal = affinity(threads_lo_bound, threads_hi_bound, nthreads, myid, chunks_fraction, &current_lower_bound, &current_higher_bound);
            }
        }
    }
}

int affinity(int* threads_lo_bound, int* threads_hi_bound, int num_of_thread, int thread_num, float chunks_fraction, int *current_lower_bound, int *current_higher_bound)
{
    int current_pos;

    if (threads_hi_bound[thread_num] - threads_lo_bound[thread_num] > 0)
    {
        current_pos = thread_num;
    }
    else
    {
        int new_pos = -1;
        int jobs_remain = 0;
    int i;
        for (i = 0; i < num_of_thread; i++)
        {
            int diff = threads_hi_bound[i] - threads_lo_bound[i];
            if (diff > jobs_remain)
            {
                new_pos = i;
                jobs_remain = diff;
            }
        }

        current_pos = new_pos;
    }

    if (current_pos == -1) return -1;

    int remaining_iterations = threads_hi_bound[current_pos] - threads_lo_bound[current_pos];
    int iter_size_fractions = (int)ceil(chunks_fraction * remaining_iterations);

    *current_lower_bound = threads_lo_bound[current_pos];
    *current_higher_bound = threads_lo_bound[current_pos] + iter_size_fractions;
    threads_lo_bound[current_pos] = threads_lo_bound[current_pos] + iter_size_fractions;

    return current_pos;
}

void loop1chunk(int lo, int hi) { 
  int i,j; 

  for (i=lo; i<hi; i++){ 
    for (j=N-1; j>i; j--){
      a[i][j] += cos(b[i][j]);
    } 
  }

} 


void loop2chunk(int lo, int hi) {
  int i,j,k; 
  double rN2; 

  rN2 = 1.0 / (double) (N*N);  

  for (i=lo; i<hi; i++){ 
    for (j=0; j < jmax[i]; j++){
      for (k=0; k<j; k++){ 
    c[i] += (k+1) * log (b[i][j]) * rN2;
      } 
    }
  }

}

void valid1(void) { 
  int i,j; 
  double suma; 

  suma= 0.0; 
  for (i=0; i<N; i++){ 
    for (j=0; j<N; j++){ 
      suma += a[i][j];
    }
  }
  printf("Loop 1 check: Sum of a is %lf\n", suma);

} 


void valid2(void) { 
  int i; 
  double sumc; 

  sumc= 0.0; 
  for (i=0; i<N; i++){ 
    sumc += c[i];
  }
  printf("Loop 2 check: Sum of c is %f\n", sumc);
}

Answer 1

您无需初始化数组threads_lo_bound和threads_hi_bound，因此它们最初包含一些完全随机的值（这是随机数1的来源）。

然后进入并行区域，这是必须意识到并非所有线程都将同步移动代码的情况，每个线程的实际速度是相当随机的，因为它与许多其他程序共享CPU仅使用1％，它仍然会显示出来（这是2号随机数的来源，我认为这与为什么您不时看到它起作用的原因更相关）。

那么当代码崩溃时会发生什么？

其中一个线程（最有可能是主线程）在其他线程中的至少一个线程到达设置threads_lo_bound[myid]和threads_hi_bound[myid]的行之前到达关键区域。

然后，根据存储在其中的那些随机值（您通常可以假设它们超出范围，您的数组很小，这些值成为有效索引的几率很小），线程将尝试通过将current_lower_bound和/或current_upper_bound设置为超出初始数组a, b, c的某个值来窃取某些作业（不存在）。

然后它将进入while(affinity_steal != -1)循环的第二次迭代并访问内存，这不可避免地导致分段错误（最终，原则上这是未定义的行为，崩溃可能发生在无效的内存访问，或者在某些情况下从没有，导致您相信一切都正常，而最肯定的是没有序。）

解决方法很简单，添加

#pragma omp barrier

恰好在while(affinity_steal != -1)循环之前，以确保所有线程均已到达该点（即，在该点同步线程），并且在进入循环之前已正确设置了边界。这样的开销很小，但是如果出于某些原因希望避免使用障碍，则可以在进入并行区域之前简单地设置数组的值。

也就是说，这样的错误通常可以使用一个好的调试器来定位，我强烈建议学习使用方法，它们使工作变得更加轻松。

OpenMP分段错误

1 个答案: