“->”的循环携带依赖阻止并行化

时间:2017-12-21 22:38:42

标签: c++ openacc pgi

我有一个Model类,它保存模型的数据并在该数据上运行多个函数。细节可能不太重要,除了它有以下设计:

  • 变量存储在类名称空间中。
  • 变量被类的方法之一初始化并释放。
  • 变量由其他几种方法使用。

该类的MWE如下所示:

#include <cstdlib>


class Model {  //Grid terrain model: stores an elevation per cell and finds each interior cell's steepest-descent neighbour
 private:
  int width;   //Number of columns; also the row stride of the flat index (c = y*width+x)
  int height;  //Number of rows
  int size;    //Total number of cells (width*height)

  int    nshift[8];      //Offset from a focal cell's index to each of its 8 neighbours
  double *restrict h;    //Digital elevation model (height), one value per cell
  int    *restrict rec;  //Per cell: slot n (index into nshift) of the receiving neighbour, or NO_FLOW

  const int NO_FLOW  = -1;  //Value stored in rec when no neighbour has a positive downhill slope
  const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
  const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};  //Divisor for the slope towards neighbour n (1 or SQRT2), same order as nshift

 private:
  void GenerateRandomTerrain(){  //Fills every cell of h with rand()/RAND_MAX
    //srand(std::random_device()());  (seeding is disabled here)
    for(int y=0;y<height;y++)
    for(int x=0;x<width;x++){
      const int c = y*width+x;
      h[c]  = rand()/(double)RAND_MAX;
    }
  }  


 public:
  Model(const int width0, const int height0)  //Builds a width0 x height0 grid with random elevations
    : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}  //Flat-index offsets for a row stride of width0
  {
    width  = width0;
    height = height0;
    size   = width*height;

    h      = new double[size];  //Released in the destructor

    GenerateRandomTerrain();
  }

  ~Model(){  //Frees h only; rec is allocated and freed inside run()
    delete[] h;
  }

 private:
  void FindDownstream(){  //Writes, for every interior cell, the steepest-descent neighbour slot (or NO_FLOW) into rec
    //! Computing receiver array. NOTE(review): the loop bounds below read the members height/width directly; the answer in this thread names that as what blocks parallelization
    #pragma acc parallel loop collapse(2) independent present(h,rec,width,height)
    for(int y=2;y<height-2;y++)  //Two rows/columns at each edge are skipped, so c+nshift[n] stays inside the arrays
    for(int x=2;x<width-2;x++){
      const int c      = y*width+x;  //Flat index of the focal cell

      //The slope must be greater than zero for there to be downhill flow;
      //otherwise, the cell is marked NO_FLOW
      double max_slope = 0;
      int    max_n     = NO_FLOW;

      #pragma acc loop seq
      for(int n=0;n<8;n++){
        double slope = (h[c] - h[c+nshift[n]])/dr[n];  //Elevation drop towards neighbour n divided by dr[n]
        if(slope>max_slope){  //Strict comparison: on ties the lowest n is kept
          max_slope = slope;
          max_n     = n;
        }
      }
      rec[c] = max_n;
    }    
  }

 public:
  void run(const int nstep){  //Calls FindDownstream() nstep+1 times (the loop uses step<=nstep) between the OpenACC enter/exit data directives
    rec    = new int[size];  //Not initialised; FindDownstream() writes interior cells only

    #pragma acc enter data copyin(h[0:size],nshift[0:8],height,width,this) create(rec[0:size])

    for(int step=0;step<=nstep;step++)
      FindDownstream();

    #pragma acc exit data copyout(h[0:size]) delete(this,rec)

    delete[] rec;  //rec lives only for the duration of run()
  }

};

int main(int argc, char **argv){
  Model model(300,300);
  model.run(100);

  return 0;
}

当我使用以下命令编译时:

pgc++ -acc -ta=tesla,pinned,cc60 -Minfo=accel  -fast test.cpp -std=c++11

我收到以下警告:

 51, Loop without integer trip count will be executed in sequential mode
     Complex loop carried dependence of rec->,nshift prevents parallelization
     Loop carried dependence of rec-> prevents parallelization
     Loop carried backward dependence of rec-> prevents vectorization

在网上查阅一番后发现,造成这种情况的典型原因是指针可能存在别名,从而导致(编译器认为存在)依赖关系。

我尝试使用 *restrict 和 independent(如上面的代码所示)告诉编译器一切正常,但它忽略了这些提示,没有并行化循环。

将指针作为参数传递给函数并适当使用 restrict 可以消除这个问题,但出于审美上的考虑,我更偏好现在这种设计(数据保存为类成员)。或者,也可以把所有方法(本质上都是内核)串联着写进 run() 函数里,但同样,这是不可取的。

如果我在内循环上使用independent,我会得到:

  

PGCC-W-0155-Inner loop of tiled/collapsed loop nest should not have another loop directive(即:平铺/折叠循环嵌套的内部循环不应该有另一个循环指令)(actual_code.cpp:227)

但是循环似乎确实被并行化了。

我使用 PGI 17.9 进行编译。

1 个答案:

答案 0 :(得分:1)

这里的问题是 “height” 和 “width” 是类的数据成员。因此,编译器必须假定可能存在对它们的外部引用,它们的值也就可能在这些循环执行期间被修改。

解决方案是将值复制到局部变量,然后使用局部变量作为循环边界。

请注意,由于外循环上有 “collapse(2)”,“independent” 子句已经同时作用于两个循环。(不过 “independent” 本来就是 “parallel” 计算区域的默认行为,因此无需显式写出。)折叠多个循环时,不允许在被折叠的内层循环上再使用第二个 “loop” 构造。

% cat test.cpp
#include <cstdlib>


class Model {  //Grid terrain model: stores an elevation per cell and finds each interior cell's steepest-descent neighbour
 private:
  int width;   //Number of columns; also the row stride of the flat index (c = y*width+x)
  int height;  //Number of rows
  int size;    //Total number of cells (width*height)

  int    nshift[8];      //Offset from a focal cell's index to each of its 8 neighbours
  double *restrict h;    //Digital elevation model (height), one value per cell
  int    *restrict rec;  //Per cell: slot n (index into nshift) of the receiving neighbour, or NO_FLOW

  const int NO_FLOW  = -1;  //Value stored in rec when no neighbour has a positive downhill slope
  const double SQRT2 = 1.414213562373095048801688724209698078569671875376948;
  const double dr[8] = {1,SQRT2,1,SQRT2,1,SQRT2,1,SQRT2};  //Divisor for the slope towards neighbour n (1 or SQRT2), same order as nshift

 private:
  void GenerateRandomTerrain(){  //Fills every cell of h with rand()/RAND_MAX
    //srand(std::random_device()());  (seeding is disabled here)
    for(int y=0;y<height;y++)
    for(int x=0;x<width;x++){
      const int c = y*width+x;
      h[c]  = rand()/(double)RAND_MAX;
    }
  }


 public:
  Model(const int width0, const int height0) : nshift{-1,-width0-1,-width0,-width0+1,1,width0+1,width0,width0-1}  //Builds a width0 x height0 grid; nshift holds flat-index offsets for row stride width0
  {
    width  = width0;
    height = height0;
    size   = width*height;

    h      = new double[size];  //Released in the destructor

    GenerateRandomTerrain();
  }

  ~Model(){  //Frees h only; rec is allocated and freed inside run()
    delete[] h;
  }

 private:
  void FindDownstream(){  //Writes, for every interior cell, the steepest-descent neighbour slot (or NO_FLOW) into rec
    //! Computing receiver array; the bounds are first copied from the data members into locals
    int hgt = height;  //Local copy of the member, used as the outer loop bound
    int wdt = width;   //Local copy of the member, used as the inner loop bound and row stride
    #pragma acc parallel loop collapse(2) present(h,rec)
    for(int y=2;y<hgt-2;y++)  //Two rows/columns at each edge are skipped, so c+nshift[n] stays inside the arrays
    for(int x=2;x<wdt-2;x++){
      const int c      = y*wdt+x;  //Flat index of the focal cell

      //The slope must be greater than zero for there to be downhill flow;
      //otherwise, the cell is marked NO_FLOW
      double max_slope = 0;
      int    max_n     = NO_FLOW;

      #pragma acc loop seq
      for(int n=0;n<8;n++){
        double slope = (h[c] - h[c+nshift[n]])/dr[n];  //Elevation drop towards neighbour n divided by dr[n]
        if(slope>max_slope){  //Strict comparison: on ties the lowest n is kept
          max_slope = slope;
          max_n     = n;
        }
      }
      rec[c] = max_n;
    }
  }

 public:
  void run(const int nstep){  //Calls FindDownstream() nstep+1 times (the loop uses step<=nstep) between the OpenACC enter/exit data directives
    rec    = new int[size];  //Not initialised; FindDownstream() writes interior cells only

    #pragma acc enter data copyin(this,h[0:size],nshift[0:8]) create(rec[0:size])

    for(int step=0;step<=nstep;step++)
      FindDownstream();

    #pragma acc exit data copyout(h[0:size]) delete(rec,nshift,this)

    delete[] rec;  //rec lives only for the duration of run()
  }

};

int main(int argc, char **argv){
  Model model(300,300);
  model.run(100);

  return 0;
}
% pgc++ test.cpp -w --c++11 -Minfo=accel -ta=tesla:cc60 -V17.10; a.out
Model::FindDownstream():
     49, Generating present(h[:])
         Accelerator kernel generated
         Generating Tesla code
         51, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
         52,   /* blockIdx.x threadIdx.x collapsed */
         61, #pragma acc loop seq
     49, Generating implicit copy(this[:])
         Generating present(rec[:])
     61, Loop carried scalar dependence for max_slope at line 63
Model::run(int):
     74, Generating enter data copyin(nshift[:],h[:size])
         Generating enter data create(rec[:size])
         Generating enter data copyin(this[:1])
     83, Generating exit data delete(this[:1],rec[:1])
         Generating exit data copyout(h[:size])
         Generating exit data delete(nshift[:])